Store precomputed KE for -m 22100 in shared memory and lock the loops per kernel invocation to a fixed value

pull/2259/head
Jens Steube 5 years ago
parent db5decb750
commit 311d363054

OpenCL/m22100-pure.cl

@@ -25,7 +25,7 @@ typedef struct bitlocker
   u32 type;
   u32 iv[4];
   u32 data[15];
-  u32 wb_ke_pc[ITERATION_BITLOCKER][64]; // only 48 needed
+  u32 wb_ke_pc[ITERATION_BITLOCKER][48];
 } bitlocker_t;
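
Dropping the first 16 schedule words (they are rebuilt in registers, see the kernel changes below) is what makes the shared-memory copy fit: with the loop count pinned to 256, one window of precomputed schedule data is exactly 48 KiB. A quick size check; treating 48 KiB as the per-workgroup local-memory budget is an assumption about the target GPU, not something the patch states:

#include <stdio.h>

#define LOOP_FIXED 256   // loops per kernel invocation, pinned by the module
#define KE_WORDS    48   // schedule words 16..63; the first 16 stay in registers

int main (void)
{
  const unsigned new_bytes = LOOP_FIXED * KE_WORDS * 4;  // 256 * 48 * 4 = 49152 = 48 KiB
  const unsigned old_bytes = LOOP_FIXED * 64 * 4;        // 65536 = 64 KiB, would not fit
  printf ("%u vs %u bytes of LOCAL_AS per workgroup\n", new_bytes, old_bytes);
  return 0;
}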
@@ -36,7 +36,13 @@ typedef struct bitlocker_tmp
 } bitlocker_tmp_t;
 
-DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, u32x *digest, const GLOBAL_AS u32 wb_ke_pc[64])
+#ifdef REAL_SHM
+#define SHM_TYPE2 LOCAL_AS
+#else
+#define SHM_TYPE2 GLOBAL_AS
+#endif
+
+DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, u32x *digest, SHM_TYPE2 u32 s_wb_ke_pc[48])
 {
   u32x a = digest[0];
   u32x b = digest[1];
@@ -64,24 +70,24 @@ DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const
   u32x we_t = w3[2];
   u32x wf_t = w3[3];
 
-#define ROUND_EXPAND_PC(i) \
-{ \
-  w0_t = wb_ke_pc[i + 0]; \
-  w1_t = wb_ke_pc[i + 1]; \
-  w2_t = wb_ke_pc[i + 2]; \
-  w3_t = wb_ke_pc[i + 3]; \
-  w4_t = wb_ke_pc[i + 4]; \
-  w5_t = wb_ke_pc[i + 5]; \
-  w6_t = wb_ke_pc[i + 6]; \
-  w7_t = wb_ke_pc[i + 7]; \
-  w8_t = wb_ke_pc[i + 8]; \
-  w9_t = wb_ke_pc[i + 9]; \
-  wa_t = wb_ke_pc[i + 10]; \
-  wb_t = wb_ke_pc[i + 11]; \
-  wc_t = wb_ke_pc[i + 12]; \
-  wd_t = wb_ke_pc[i + 13]; \
-  we_t = wb_ke_pc[i + 14]; \
-  wf_t = wb_ke_pc[i + 15]; \
+#define ROUND_EXPAND_PC(i) \
+{ \
+  w0_t = s_wb_ke_pc[i + 0]; \
+  w1_t = s_wb_ke_pc[i + 1]; \
+  w2_t = s_wb_ke_pc[i + 2]; \
+  w3_t = s_wb_ke_pc[i + 3]; \
+  w4_t = s_wb_ke_pc[i + 4]; \
+  w5_t = s_wb_ke_pc[i + 5]; \
+  w6_t = s_wb_ke_pc[i + 6]; \
+  w7_t = s_wb_ke_pc[i + 7]; \
+  w8_t = s_wb_ke_pc[i + 8]; \
+  w9_t = s_wb_ke_pc[i + 9]; \
+  wa_t = s_wb_ke_pc[i + 10]; \
+  wb_t = s_wb_ke_pc[i + 11]; \
+  wc_t = s_wb_ke_pc[i + 12]; \
+  wd_t = s_wb_ke_pc[i + 13]; \
+  we_t = s_wb_ke_pc[i + 14]; \
+  wf_t = s_wb_ke_pc[i + 15]; \
 }
 
 #define ROUND_STEP(i) \
@@ -104,12 +110,14 @@ DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const
     SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \
   }
 
+  ROUND_STEP (0);
+
 #ifdef _unroll
 #pragma unroll
 #endif
-  for (int i = 0; i < 64; i += 16)
+  for (int i = 16; i < 64; i += 16)
   {
-    ROUND_EXPAND_PC (i); ROUND_STEP (i);
+    ROUND_EXPAND_PC (i - 16); ROUND_STEP (i);
   }
 
 #undef ROUND_EXPAND_PC
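
With ROUND_STEP (0) consuming the sixteen message words still held in registers, the 48-entry array only stores schedule words 16..63, so round block i now reads entry i - 16 where the old code read entry i. A scalar sanity-check sketch of that index shift (plain C, not kernel code; full[] stands for the old 64-word layout, pc[] for the new 48-word one):

#include <assert.h>
#include <stdint.h>

/* pc[t] is defined as full[t + 16]; verify the indexing used by the new
   ROUND_EXPAND_PC (i - 16) against the old ROUND_EXPAND_PC (i) */
static void check_index_shift (const uint32_t full[64], const uint32_t pc[48])
{
  for (int i = 16; i < 64; i += 16)   // the three remaining round blocks
    for (int k = 0; k < 16; k++)
      assert (full[i + k] == pc[(i - 16) + k]);
}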
@@ -188,9 +196,60 @@ KERNEL_FQ void m22100_init (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
 KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
 {
   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * load 256 full w[] precomputed KE buffers into shared memory since it's all static data
+   * in order for this to work we need to fix the loop count per kernel invocation to 256
+   */
+
+#ifdef REAL_SHM
+
+  LOCAL_VK u32 s_wb_ke_pc[256][48];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    for (u32 j = 0; j < 48; j++) // the first 16 words are kept in registers
+    {
+      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + i][j];
+    }
+  }
+
+  SYNC_THREADS ();
+
+#else
+
+  GLOBAL_AS u32 (*s_wb_ke_pc)[48] = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos];
+
+#endif
 
   if ((gid * VECT_SIZE) >= gid_max) return;
 
+  // salt to registers
+
+  u32x t0[4];
+  u32x t1[4];
+  u32x t2[4];
+  u32x t3[4];
+
+  t0[0] = salt_bufs[salt_pos].salt_buf[0];
+  t0[1] = salt_bufs[salt_pos].salt_buf[1];
+  t0[2] = salt_bufs[salt_pos].salt_buf[2];
+  t0[3] = salt_bufs[salt_pos].salt_buf[3];
+  t1[0] = 0;
+  t1[1] = 0;
+  t1[2] = 0x80000000;
+  t1[3] = 0;
+  t2[0] = 0;
+  t2[1] = 0;
+  t2[2] = 0;
+  t2[3] = 0;
+  t3[0] = 0;
+  t3[1] = 0;
+  t3[2] = 0;
+  t3[3] = 88 * 8;
+
   // init
 
   u32x w0[4];
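
The REAL_SHM branch above is a standard cooperative load: the workgroup's threads stride across the 256 iteration slots so each slot is copied exactly once, and SYNC_THREADS () publishes the table before any reads. Note that the early-exit gid check now sits after the barrier, since every thread in the workgroup must reach it. A stripped-down scalar model of the access pattern (generic C with stand-in names, not the kernel's types):

#include <stdint.h>

/* model of the strided copy: thread `lid` of `lsz` handles slots
   lid, lid + lsz, lid + 2*lsz, ... so the table is written exactly once */
static void coop_load (uint32_t dst[256][48], const uint32_t src[][48],
                       unsigned lid, unsigned lsz)
{
  for (unsigned i = lid; i < 256; i += lsz)
    for (unsigned j = 0; j < 48; j++)
      dst[i][j] = src[i][j];
  /* in the kernel a workgroup barrier (SYNC_THREADS ()) follows here */
}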
@@ -230,8 +289,11 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
     digest[6] = SHA256M_G;
     digest[7] = SHA256M_H;
 
-    sha256_transform_vector (w0, w1, w2, w3, digest);
-    sha256_transform_vector_pc (w0, w1, w2, w3, digest, esalt_bufs[digests_offset].wb_ke_pc[j]);
+    sha256_transform_vector (w0, w1, w2, w3, digest);
+
+    t1[0] = hc_swap32_S (j); // only moving part
+
+    sha256_transform_vector_pc (t0, t1, t2, t3, digest, s_wb_ke_pc[i]);
 
     w0[0] = digest[0];
     w0[1] = digest[1];
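
For orientation, each loop iteration is one step of the BitLocker key-stretching chain: the 88-byte message last_sha256 (32) || initial_sha256 (32) || salt (16) || count (8) is hashed as two SHA-256 blocks. Block one (the two digests) changes completely every iteration and keeps the generic transform; in block two only the counter word moves, which is why the rest of its schedule can be precomputed. A hedged layout sketch in plain C (the compress calls themselves are left out; little-endian host assumed for the count copy):

#include <stdint.h>
#include <string.h>

/* build the two 64-byte SHA-256 blocks hashed per iteration */
static void build_blocks (uint8_t b1[64], uint8_t b2[64],
                          const uint8_t last[32], const uint8_t initial[32],
                          const uint8_t salt[16], uint64_t count)
{
  memcpy (b1, last, 32);          // block 1 changes fully each round
  memcpy (b1 + 32, initial, 32);  // -> generic sha256_transform_vector

  memset (b2, 0, 64);
  memcpy (b2, salt, 16);          // block 2: static salt (t0)...
  memcpy (b2 + 16, &count, 8);    // ...plus the only moving part (t1[0], t1[1])
  b2[24] = 0x80;                  // padding bit after 88 message bytes (t1[2])
  b2[62] = 0x02; b2[63] = 0xc0;   // length field: 88 * 8 = 704 = 0x2c0 bits (t3[3])
  // -> sha256_transform_vector_pc with the precomputed schedule words 16..63
}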

src/modules/module_22100.c

@@ -51,7 +51,7 @@ typedef struct bitlocker
   u32 type;
   u32 iv[4];
   u32 data[15];
-  u32 wb_ke_pc[ITERATION_BITLOCKER][64]; // only 48 needed
+  u32 wb_ke_pc[ITERATION_BITLOCKER][48];
 } bitlocker_t;
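
Since the esalt blob is copied to the device as raw bytes, this module-side bitlocker_t has to stay byte-identical to the kernel-side definition changed above. A hypothetical guard, not part of the patch, that would catch the two copies drifting apart:

/* hypothetical compile-time check: both definitions must agree on the layout */
_Static_assert (sizeof (((bitlocker_t *) 0)->wb_ke_pc)
                == (size_t) ITERATION_BITLOCKER * 48 * sizeof (u32),
                "wb_ke_pc layout drifted from the kernel definition");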
@@ -78,11 +78,18 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 256;
+  const u32 kernel_loops_min = 256;
 
-  return kernel_threads_max;
+  return kernel_loops_min;
 }
 
+u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_max = 256;
+
+  return kernel_loops_max;
+}
+
 u32 module_pw_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
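
Pinning module_kernel_loops_min and module_kernel_loops_max to the same value removes the autotuner's freedom: every invocation of the loop kernel gets exactly 256 iterations, so loop_pos advances in steps of 256 and the kernel can safely index s_wb_ke_pc with its local counter i. Roughly, the host-side driver loop degenerates to the following (illustrative only; run_loop_kernel is a made-up stand-in, and BitLocker's iteration count of 1048576 is assumed):

#define ITERATION_BITLOCKER 1048576
#define LOOP_FIXED          256

static void dispatch_loops (void (*run_loop_kernel) (unsigned loop_pos, unsigned loop_cnt))
{
  for (unsigned loop_pos = 0; loop_pos < ITERATION_BITLOCKER; loop_pos += LOOP_FIXED)
  {
    run_loop_kernel (loop_pos, LOOP_FIXED); // kernel sees s_wb_ke_pc[i] == wb_ke_pc[loop_pos + i]
  }
  // 1048576 / 256 = 4096 loop-kernel calls per password batch
}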
@@ -210,34 +217,35 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   for (int i = 0; i < ITERATION_BITLOCKER; i++)
   {
-    bitlocker->wb_ke_pc[i][ 0] = salt->salt_buf[0];
-    bitlocker->wb_ke_pc[i][ 1] = salt->salt_buf[1];
-    bitlocker->wb_ke_pc[i][ 2] = salt->salt_buf[2];
-    bitlocker->wb_ke_pc[i][ 3] = salt->salt_buf[3];
-    bitlocker->wb_ke_pc[i][ 4] = byte_swap_32 (i);
-    bitlocker->wb_ke_pc[i][ 5] = 0;
-    bitlocker->wb_ke_pc[i][ 6] = 0x80000000;
-    bitlocker->wb_ke_pc[i][ 7] = 0;
-    bitlocker->wb_ke_pc[i][ 8] = 0;
-    bitlocker->wb_ke_pc[i][ 9] = 0;
-    bitlocker->wb_ke_pc[i][10] = 0;
-    bitlocker->wb_ke_pc[i][11] = 0;
-    bitlocker->wb_ke_pc[i][12] = 0;
-    bitlocker->wb_ke_pc[i][13] = 0;
-    bitlocker->wb_ke_pc[i][14] = 0;
-    bitlocker->wb_ke_pc[i][15] = 88 * 8;
+    u32 tmp[64];
+
+    tmp[ 0] = salt->salt_buf[0];
+    tmp[ 1] = salt->salt_buf[1];
+    tmp[ 2] = salt->salt_buf[2];
+    tmp[ 3] = salt->salt_buf[3];
+    tmp[ 4] = byte_swap_32 (i);
+    tmp[ 5] = 0;
+    tmp[ 6] = 0x80000000;
+    tmp[ 7] = 0;
+    tmp[ 8] = 0;
+    tmp[ 9] = 0;
+    tmp[10] = 0;
+    tmp[11] = 0;
+    tmp[12] = 0;
+    tmp[13] = 0;
+    tmp[14] = 0;
+    tmp[15] = 88 * 8;
 
 #define hc_rotl32_S rotl32
 
     for (int j = 16; j < 64; j++)
     {
-      bitlocker->wb_ke_pc[i][j] = SHA256_EXPAND_S
-      (
-        bitlocker->wb_ke_pc[i][j - 2],
-        bitlocker->wb_ke_pc[i][j - 7],
-        bitlocker->wb_ke_pc[i][j - 15],
-        bitlocker->wb_ke_pc[i][j - 16]
-      );
+      tmp[j] = SHA256_EXPAND_S (tmp[j - 2], tmp[j - 7], tmp[j - 15], tmp[j - 16]);
     }
+
+    for (int j = 0; j < 48; j++)
+    {
+      bitlocker->wb_ke_pc[i][j] = tmp[16 + j];
+    }
   }
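
SHA256_EXPAND_S above is hashcat's scalar message-schedule step, w[t] = sigma1 (w[t-2]) + w[t-7] + sigma0 (w[t-15]) + w[t-16]. A self-contained sketch of the same precompute with plain stand-ins for the hashcat macros (sig0/sig1 are the standard SHA-256 small sigmas; names here are local helpers, not hashcat's):

#include <stdint.h>

static uint32_t rotr32 (uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static uint32_t sig0 (uint32_t x) { return rotr32 (x, 7) ^ rotr32 (x, 18) ^ (x >> 3); }
static uint32_t sig1 (uint32_t x) { return rotr32 (x, 17) ^ rotr32 (x, 19) ^ (x >> 10); }

/* expand one 16-word block to the full 64-word schedule, keep words 16..63 */
static void precompute_ke (const uint32_t block[16], uint32_t out48[48])
{
  uint32_t w[64];

  for (int t = 0;  t < 16; t++) w[t] = block[t];
  for (int t = 16; t < 64; t++) w[t] = sig1 (w[t - 2]) + w[t - 7] + sig0 (w[t - 15]) + w[t - 16];
  for (int t = 0;  t < 48; t++) out48[t] = w[16 + t];
}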
@@ -423,9 +431,9 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
-  module_ctx->module_kernel_loops_max = MODULE_DEFAULT;
-  module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
-  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
+  module_ctx->module_kernel_loops_max = module_kernel_loops_max;
+  module_ctx->module_kernel_loops_min = module_kernel_loops_min;
+  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
