@ -3,7 +3,7 @@
* License.....: MIT
*/
# define NEW_SIMD_CODE
//# define NEW_SIMD_CODE
# ifdef KERNEL_STATIC
# include "inc_vendor.h"
@ -18,9 +18,9 @@ typedef struct gpg
{
  u32 cipher_algo;       // symmetric cipher id used for the secret-key material
  u32 iv[4];             // CFB initialization vector (16 bytes)
  u32 modulus_size;      // RSA/DSA/ElGamal modulus size -- TODO confirm unit (bytes vs bits) against host code
  u32 encrypted_data[384];
  u32 encrypted_data_size;

} gpg_t;
@ -42,31 +42,58 @@ typedef struct gpg_tmp
} gpg_tmp_t ;
DECLSPEC void memcat_be_S ( u32 *block, const u32 offset, const u32 *append, u32 len )
DECLSPEC u32 hc_bytealign_le_S ( const u32 a, const u32 b, const int c )
{
const int c_mod_4 = c & 3 ;
const u32 r = hc_byte_perm_S ( b, a, ( 0x76543210 >> ( c_mod_4 * 4 ) ) & 0xffff ) ;
return r ;
}
DECLSPEC void memcat_le_S (u32 *block, const u32 offset, const u32 *append, u32 len)
{
  // Appends 'len' bytes of 'append' into 'block' starting at byte 'offset'
  // (little-endian u32 packing). Assumes offset > 0 and that block[] is
  // zero beyond 'offset' so the first word can be OR-merged in place.
  const u32 start_index = (offset - 1) >> 2;                  // first u32 touched
  const u32 count = ((offset + len + 3) >> 2) - start_index;  // u32s written in total

  const int off_mod_4 = offset & 3;
  const int off_minus_4 = 4 - off_mod_4;

  // merge the head of append[] into the partially-filled first word
  block[start_index] |= hc_bytealign_le_S (append[0], 0, off_minus_4);

  // every following word combines the tail of the previous append word
  // with the head of the current one
  for (u32 idx = 1; idx < count; idx++)
  {
    block[start_index + idx] = hc_bytealign_le_S (append[idx], append[idx - 1], off_minus_4);
  }
}
DECLSPEC void memzero_ b e_S ( u32 *block, const u32 start_offset, const u32 end_offset )
DECLSPEC void memzero_ l e_S ( u32 *block, const u32 start_offset, const u32 end_offset )
{
const u32 start_idx = ( start_offset + 3 ) / 4 ;
const u32 start_idx = start_offset / 4 ;
// zero out bytes in the first u32 starting from 'start_offset '
block[start_idx] &= 0xffffffff >> ( ( 4 - ( start_offset & 3 ) ) * 8 ) ;
const u32 end_idx = ( end_offset + 3 ) / 4 ;
// zero out bytes in u32 units -- note that the last u32 is completely zeroed!
for ( u32 i = start_idx + 1 ; i < end_idx; i++)
{
block[i] = 0 ;
}
}
DECLSPEC void memzero_be_S (u32 *block, const u32 start_offset, const u32 end_offset)
{
  // Zeroes block[] from byte 'start_offset' up to byte 'end_offset'
  // (big-endian layout). Note that the last u32 is completely zeroed!
  const u32 start_idx = start_offset / 4;

  if (start_offset & 3)
  {
    // keep only the (start_offset & 3) high-order bytes of the first u32
    block[start_idx] &= 0xffffffff << ((4 - (start_offset & 3)) * 8);
  }
  else
  {
    // aligned start: a shift by 32 would be undefined, zero the word directly
    block[start_idx] = 0;
  }

  const u32 end_idx = (end_offset + 3) / 4;

  // zero out the remaining whole u32 units
  for (u32 i = start_idx + 1; i < end_idx; i++)
  {
    block[i] = 0;
  }
}
@ -143,13 +170,14 @@ DECLSPEC int check_decoded_data (u32 *decoded_data, const u32 decoded_data_size)
const u32 sha1_u32_off = sha1_byte_off / 4 ;
u32 expected_sha1[5] ;
expected_sha1[0] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 1], decoded_data[sha1_u32_off + 0], sha1_byte_off ) ;
expected_sha1[1] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 2], decoded_data[sha1_u32_off + 1], sha1_byte_off ) ;
expected_sha1[2] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 3], decoded_data[sha1_u32_off + 2], sha1_byte_off ) ;
expected_sha1[3] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 4], decoded_data[sha1_u32_off + 3], sha1_byte_off ) ;
expected_sha1[4] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 5], decoded_data[sha1_u32_off + 4], sha1_byte_off ) ;
memzero_be_S ( decoded_data, sha1_byte_off, 384 * sizeof ( u32 ) ) ;
expected_sha1[0] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 1], decoded_data[sha1_u32_off + 0], sha1_byte_off ) ;
expected_sha1[1] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 2], decoded_data[sha1_u32_off + 1], sha1_byte_off ) ;
expected_sha1[2] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 3], decoded_data[sha1_u32_off + 2], sha1_byte_off ) ;
expected_sha1[3] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 4], decoded_data[sha1_u32_off + 3], sha1_byte_off ) ;
expected_sha1[4] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 5], decoded_data[sha1_u32_off + 4], sha1_byte_off ) ;
memzero_le_S ( decoded_data, sha1_byte_off, 384 * sizeof ( u32 ) ) ;
sha1_ctx_t ctx ;
@ -188,14 +216,31 @@ KERNEL_FQ void m17010_init (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
// create a number of copies for efficiency
const u32 copies = 80 * sizeof ( u32 ) / salted_pw_len ;
for ( u32 idx = 1 ; idx < copies; idx++)
{
memcat_ b e_S ( salted_pw_block, idx * salted_pw_len, salted_pw_block, salted_pw_len ) ;
memcat_ l e_S ( salted_pw_block, idx * salted_pw_len, salted_pw_block, salted_pw_len ) ;
}
for ( u32 idx = 0 ; idx < 80; idx++) tmps[gid].salted_pw_block[idx] = salted_pw_block[idx];
for ( u32 idx = 0 ; idx < 80; idx++)
{
tmps[gid].salted_pw_block[idx] = hc_swap32_S ( salted_pw_block[idx] ) ;
}
tmps[gid].salted_pw_block_len = ( copies * salted_pw_len ) ;
tmps[gid].h[0] = SHA1M_A ;
tmps[gid].h[1] = SHA1M_B ;
tmps[gid].h[2] = SHA1M_C ;
tmps[gid].h[3] = SHA1M_D ;
tmps[gid].h[4] = SHA1M_E ;
tmps[gid].h[5] = SHA1M_A ;
tmps[gid].h[6] = SHA1M_B ;
tmps[gid].h[7] = SHA1M_C ;
tmps[gid].h[8] = SHA1M_D ;
tmps[gid].h[9] = SHA1M_E ;
tmps[gid].len = 0 ;
}
KERNEL_FQ void m17010_loop_prepare (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
{
  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  // Reset the SHA1 message buffer for this salt_repeat pass. The SHA1
  // state itself lives in tmps[gid].h[salt_repeat * 5] (set up by _init).
  tmps[gid].w0[0] = 0;
  tmps[gid].w0[1] = 0;
  tmps[gid].w0[2] = 0;
  tmps[gid].w0[3] = 0;
  tmps[gid].w1[0] = 0;
  tmps[gid].w1[1] = 0;
  tmps[gid].w1[2] = 0;
  tmps[gid].w1[3] = 0;
  tmps[gid].w2[0] = 0;
  tmps[gid].w2[1] = 0;
  tmps[gid].w2[2] = 0;
  tmps[gid].w2[3] = 0;
  tmps[gid].w3[0] = 0;
  tmps[gid].w3[1] = 0;
  tmps[gid].w3[2] = 0;
  tmps[gid].w3[3] = 0;

  // Starting len at salt_repeat makes the (zeroed) buffer act as the
  // salt_repeat zero-byte pad needed for larger target key sizes
  // (e.g. AES-256) -- replaces the old explicit sha1_update of zeroes.
  tmps[gid].len = salt_repeat;
}
KERNEL_FQ void m17010_loop ( KERN_ATTR_TMPS_ESALT ( gpg_tmp_t, gpg_t ) )
@ -236,12 +274,15 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
const u64 gid = get_global_id ( 0 ) ;
if ( gid >= gid_max ) return ;
// get the prepared buffer from the gpg_tmp_t struct into a local buffer
u32 salted_pw_block[80] ;
for ( int i = 0 ; i < 80; i++) salted_pw_block[i] = tmps[gid].salted_pw_block[i];
const u32 salted_pw_block_len = tmps[gid].salted_pw_block_len ;
// do we really need this, since the salt is always length 8?
if ( salted_pw_block_len == 0 ) return ;
/**
@ -253,6 +294,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
const u32 sha_offset = salt_repeat * 5 ;
for ( int i = 0 ; i < 5; i++) ctx.h[i] = tmps[gid].h[sha_offset + i];
for ( int i = 0 ; i < 4; i++) ctx.w0[i] = tmps[gid].w0[i];
for ( int i = 0 ; i < 4; i++) ctx.w1[i] = tmps[gid].w1[i];
for ( int i = 0 ; i < 4; i++) ctx.w2[i] = tmps[gid].w2[i];
@ -268,7 +310,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
for ( u32 i = 0 ; i < rounds; i++)
{
sha1_update _swap ( &ctx, salted_pw_block, salted_pw_block_len ) ;
sha1_update ( &ctx, salted_pw_block, salted_pw_block_len ) ;
}
if ( ( loop_pos + loop_cnt ) == salt_iter )
@ -279,7 +321,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
{
memzero_be_S ( salted_pw_block, remaining_bytes, salted_pw_block_len ) ;
sha1_update _swap ( &ctx, salted_pw_block, remaining_bytes ) ;
sha1_update ( &ctx, salted_pw_block, remaining_bytes ) ;
}
sha1_final ( &ctx ) ;
@ -290,6 +332,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
*/
for ( int i = 0 ; i < 5; i++) tmps[gid].h[sha_offset + i] = ctx.h[i];
for ( int i = 0 ; i < 4; i++) tmps[gid].w0[i] = ctx.w0[i];
for ( int i = 0 ; i < 4; i++) tmps[gid].w1[i] = ctx.w1[i];
for ( int i = 0 ; i < 4; i++) tmps[gid].w2[i] = ctx.w2[i];