From 311d36305452e88fdbdbbcbad3d50a73357ec3b3 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 1 Jan 2020 20:48:55 +0100 Subject: [PATCH] Store precomputed KE for -m 22100 in shared memory and lock the loops per kernel invocation to a fixed value --- OpenCL/m22100-pure.cl | 110 +++++++++++++++++++++++++++++-------- src/modules/module_22100.c | 68 +++++++++++++---------- 2 files changed, 124 insertions(+), 54 deletions(-) diff --git a/OpenCL/m22100-pure.cl b/OpenCL/m22100-pure.cl index c279c3dde..270cc52ef 100644 --- a/OpenCL/m22100-pure.cl +++ b/OpenCL/m22100-pure.cl @@ -25,7 +25,7 @@ typedef struct bitlocker u32 type; u32 iv[4]; u32 data[15]; - u32 wb_ke_pc[ITERATION_BITLOCKER][64]; // only 48 needed + u32 wb_ke_pc[ITERATION_BITLOCKER][48]; } bitlocker_t; @@ -36,7 +36,13 @@ typedef struct bitlocker_tmp } bitlocker_tmp_t; -DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, u32x *digest, const GLOBAL_AS u32 wb_ke_pc[64]) +#ifdef REAL_SHM +#define SHM_TYPE2 LOCAL_AS +#else +#define SHM_TYPE2 GLOBAL_AS +#endif + +DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, u32x *digest, SHM_TYPE2 u32 s_wb_ke_pc[48]) { u32x a = digest[0]; u32x b = digest[1]; @@ -64,24 +70,24 @@ DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const u32x we_t = w3[2]; u32x wf_t = w3[3]; - #define ROUND_EXPAND_PC(i) \ - { \ - w0_t = wb_ke_pc[i + 0]; \ - w1_t = wb_ke_pc[i + 1]; \ - w2_t = wb_ke_pc[i + 2]; \ - w3_t = wb_ke_pc[i + 3]; \ - w4_t = wb_ke_pc[i + 4]; \ - w5_t = wb_ke_pc[i + 5]; \ - w6_t = wb_ke_pc[i + 6]; \ - w7_t = wb_ke_pc[i + 7]; \ - w8_t = wb_ke_pc[i + 8]; \ - w9_t = wb_ke_pc[i + 9]; \ - wa_t = wb_ke_pc[i + 10]; \ - wb_t = wb_ke_pc[i + 11]; \ - wc_t = wb_ke_pc[i + 12]; \ - wd_t = wb_ke_pc[i + 13]; \ - we_t = wb_ke_pc[i + 14]; \ - wf_t = wb_ke_pc[i + 15]; \ + #define ROUND_EXPAND_PC(i) \ + { \ + w0_t = s_wb_ke_pc[i + 0]; \ + w1_t = s_wb_ke_pc[i + 1]; \ + w2_t = s_wb_ke_pc[i + 2]; \ + w3_t = s_wb_ke_pc[i + 3]; \ + w4_t = s_wb_ke_pc[i + 4]; \ + w5_t = s_wb_ke_pc[i + 5]; \ + w6_t = s_wb_ke_pc[i + 6]; \ + w7_t = s_wb_ke_pc[i + 7]; \ + w8_t = s_wb_ke_pc[i + 8]; \ + w9_t = s_wb_ke_pc[i + 9]; \ + wa_t = s_wb_ke_pc[i + 10]; \ + wb_t = s_wb_ke_pc[i + 11]; \ + wc_t = s_wb_ke_pc[i + 12]; \ + wd_t = s_wb_ke_pc[i + 13]; \ + we_t = s_wb_ke_pc[i + 14]; \ + wf_t = s_wb_ke_pc[i + 15]; \ } #define ROUND_STEP(i) \ @@ -104,12 +110,14 @@ DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ } + ROUND_STEP (0); + #ifdef _unroll #pragma unroll #endif - for (int i = 0; i < 64; i += 16) + for (int i = 16; i < 64; i += 16) { - ROUND_EXPAND_PC (i); ROUND_STEP (i); + ROUND_EXPAND_PC (i - 16); ROUND_STEP (i); } #undef ROUND_EXPAND_PC @@ -188,9 +196,60 @@ KERNEL_FQ void m22100_init (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t)) KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t)) { const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + const u64 lsz = get_local_size (0); + + /** + * load 256 full w[] precomputed KE buffers into shared memory since its all static data + * in order for this to work we need to set a fixed loop count to 256 + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_wb_ke_pc[256][48]; + + for (u32 i = lid; i < 256; i += lsz) + { + for (u32 j = 0; j < 48; j++) // first 16 set to register + { + s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + i][j]; + } + } + + SYNC_THREADS (); + + #else + + GLOBAL_AS u32 (*s_wb_ke_pc)[48] = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos]; + + #endif if ((gid * VECT_SIZE) >= gid_max) return; + // salt to register + + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = salt_bufs[salt_pos].salt_buf[0]; + t0[1] = salt_bufs[salt_pos].salt_buf[1]; + t0[2] = salt_bufs[salt_pos].salt_buf[2]; + t0[3] = salt_bufs[salt_pos].salt_buf[3]; + t1[0] = 0; + t1[1] = 0; + t1[2] = 0x80000000; + t1[3] = 0; + t2[0] = 0; + t2[1] = 0; + t2[2] = 0; + t2[3] = 0; + t3[0] = 0; + t3[1] = 0; + t3[2] = 0; + t3[3] = 88 * 8; + // init u32x w0[4]; @@ -230,8 +289,11 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t)) digest[6] = SHA256M_G; digest[7] = SHA256M_H; - sha256_transform_vector (w0, w1, w2, w3, digest); - sha256_transform_vector_pc (w0, w1, w2, w3, digest, esalt_bufs[digests_offset].wb_ke_pc[j]); + sha256_transform_vector (w0, w1, w2, w3, digest); + + t1[0] = hc_swap32_S (j); // only moving part + + sha256_transform_vector_pc (t0, t1, t2, t3, digest, s_wb_ke_pc[i]); w0[0] = digest[0]; w0[1] = digest[1]; diff --git a/src/modules/module_22100.c b/src/modules/module_22100.c index 82d008100..bdf3d667a 100644 --- a/src/modules/module_22100.c +++ b/src/modules/module_22100.c @@ -51,7 +51,7 @@ typedef struct bitlocker u32 type; u32 iv[4]; u32 data[15]; - u32 wb_ke_pc[ITERATION_BITLOCKER][64]; // only 48 needed + u32 wb_ke_pc[ITERATION_BITLOCKER][48]; } bitlocker_t; @@ -78,11 +78,18 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 256; + const u32 kernel_loops_min = 256; - return kernel_threads_max; + return kernel_loops_min; +} + +u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_max = 256; + + return kernel_loops_max; } u32 module_pw_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -210,34 +217,35 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE for (int i = 0; i < ITERATION_BITLOCKER; i++) { - bitlocker->wb_ke_pc[i][ 0] = salt->salt_buf[0]; - bitlocker->wb_ke_pc[i][ 1] = salt->salt_buf[1]; - bitlocker->wb_ke_pc[i][ 2] = salt->salt_buf[2]; - bitlocker->wb_ke_pc[i][ 3] = salt->salt_buf[3]; - bitlocker->wb_ke_pc[i][ 4] = byte_swap_32 (i); - bitlocker->wb_ke_pc[i][ 5] = 0; - bitlocker->wb_ke_pc[i][ 6] = 0x80000000; - bitlocker->wb_ke_pc[i][ 7] = 0; - bitlocker->wb_ke_pc[i][ 8] = 0; - bitlocker->wb_ke_pc[i][ 9] = 0; - bitlocker->wb_ke_pc[i][10] = 0; - bitlocker->wb_ke_pc[i][11] = 0; - bitlocker->wb_ke_pc[i][12] = 0; - bitlocker->wb_ke_pc[i][13] = 0; - bitlocker->wb_ke_pc[i][14] = 0; - bitlocker->wb_ke_pc[i][15] = 88 * 8; + u32 tmp[64]; + + tmp[ 0] = salt->salt_buf[0]; + tmp[ 1] = salt->salt_buf[1]; + tmp[ 2] = salt->salt_buf[2]; + tmp[ 3] = salt->salt_buf[3]; + tmp[ 4] = byte_swap_32 (i); + tmp[ 5] = 0; + tmp[ 6] = 0x80000000; + tmp[ 7] = 0; + tmp[ 8] = 0; + tmp[ 9] = 0; + tmp[10] = 0; + tmp[11] = 0; + tmp[12] = 0; + tmp[13] = 0; + tmp[14] = 0; + tmp[15] = 88 * 8; #define hc_rotl32_S rotl32 for (int j = 16; j < 64; j++) { - bitlocker->wb_ke_pc[i][j] = SHA256_EXPAND_S - ( - bitlocker->wb_ke_pc[i][j - 2], - bitlocker->wb_ke_pc[i][j - 7], - bitlocker->wb_ke_pc[i][j - 15], - bitlocker->wb_ke_pc[i][j - 16] - ); + tmp[j] = SHA256_EXPAND_S (tmp[j - 2], tmp[j - 7], tmp[j - 15], tmp[j - 16]); + } + + for (int j = 0; j < 48; j++) + { + bitlocker->wb_ke_pc[i][j] = tmp[16 + j]; } } @@ -423,9 +431,9 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_jit_cache_disable = MODULE_DEFAULT; module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; - module_ctx->module_kernel_loops_max = MODULE_DEFAULT; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_loops_max = module_kernel_loops_max; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;