From 349b3c46736726b367c10eafe8eb004edbab98a4 Mon Sep 17 00:00:00 2001
From: Jens Steube
Date: Thu, 2 Jan 2020 11:59:37 +0100
Subject: [PATCH] Fix Bitlocker in OpenCL mode on NV

---
 OpenCL/m22100-pure.cl | 156 +++++++++++++++++++++++++++++-------------
 1 file changed, 110 insertions(+), 46 deletions(-)

diff --git a/OpenCL/m22100-pure.cl b/OpenCL/m22100-pure.cl
index 270cc52ef..bfb4b7908 100644
--- a/OpenCL/m22100-pure.cl
+++ b/OpenCL/m22100-pure.cl
@@ -199,32 +199,29 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
   const u64 lid = get_local_id (0);
   const u64 lsz = get_local_size (0);
 
-  /**
-   * load 256 full w[] precomputed KE buffers into shared memory since its all static data
-   * in order for this to work we need to set a fixed loop count to 256
-   */
+  // init
 
-  #ifdef REAL_SHM
+  u32x w0[4];
+  u32x w1[4];
+  u32x w2[4];
+  u32x w3[4];
 
-  LOCAL_VK u32 s_wb_ke_pc[256][48];
-
-  for (u32 i = lid; i < 256; i += lsz)
-  {
-    for (u32 j = 0; j < 48; j++) // first 16 set to register
-    {
-      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + i][j];
-    }
-  }
-
-  SYNC_THREADS ();
-
-  #else
-
-  GLOBAL_AS u32 (*s_wb_ke_pc)[48] = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos];
-
-  #endif
-
-  if ((gid * VECT_SIZE) >= gid_max) return;
+  w0[0] = packv (tmps, last_hash, gid, 0); // last_hash
+  w0[1] = packv (tmps, last_hash, gid, 1);
+  w0[2] = packv (tmps, last_hash, gid, 2);
+  w0[3] = packv (tmps, last_hash, gid, 3);
+  w1[0] = packv (tmps, last_hash, gid, 4);
+  w1[1] = packv (tmps, last_hash, gid, 5);
+  w1[2] = packv (tmps, last_hash, gid, 6);
+  w1[3] = packv (tmps, last_hash, gid, 7);
+  w2[0] = packv (tmps, init_hash, gid, 0); // init_hash
+  w2[1] = packv (tmps, init_hash, gid, 1);
+  w2[2] = packv (tmps, init_hash, gid, 2);
+  w2[3] = packv (tmps, init_hash, gid, 3);
+  w3[0] = packv (tmps, init_hash, gid, 4);
+  w3[1] = packv (tmps, init_hash, gid, 5);
+  w3[2] = packv (tmps, init_hash, gid, 6);
+  w3[3] = packv (tmps, init_hash, gid, 7);
 
   // salt to register
 
@@ -250,33 +247,46 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
   t3[2] = 0;
   t3[3] = 88 * 8;
 
-  // init
+  /**
+   * load FIXED_ITER_COUNT full w[] precomputed KE buffers into shared memory since it's all static data
+   * in order for this to work we need to set a fixed loop count to FIXED_ITER_COUNT
+   * We also need to handle OpenCL and CUDA differently because of:
+   * ptxas error : Entry function 'm22100_loop' uses too much shared data (0xc004 bytes, 0xc000 max)
+   */
 
-  u32x w0[4];
-  u32x w1[4];
-  u32x w2[4];
-  u32x w3[4];
+  #ifdef IS_CUDA
+  #define FIXED_ITER_COUNT 256
+  #else
+  #define FIXED_ITER_COUNT 128
+  #endif
 
-  w0[0] = packv (tmps, last_hash, gid, 0); // last_hash
-  w0[1] = packv (tmps, last_hash, gid, 1);
-  w0[2] = packv (tmps, last_hash, gid, 2);
-  w0[3] = packv (tmps, last_hash, gid, 3);
-  w1[0] = packv (tmps, last_hash, gid, 4);
-  w1[1] = packv (tmps, last_hash, gid, 5);
-  w1[2] = packv (tmps, last_hash, gid, 6);
-  w1[3] = packv (tmps, last_hash, gid, 7);
-  w2[0] = packv (tmps, init_hash, gid, 0); // init_hash
-  w2[1] = packv (tmps, init_hash, gid, 1);
-  w2[2] = packv (tmps, init_hash, gid, 2);
-  w2[3] = packv (tmps, init_hash, gid, 3);
-  w3[0] = packv (tmps, init_hash, gid, 4);
-  w3[1] = packv (tmps, init_hash, gid, 5);
-  w3[2] = packv (tmps, init_hash, gid, 6);
-  w3[3] = packv (tmps, init_hash, gid, 7);
+  #ifdef REAL_SHM
+  LOCAL_VK u32 s_wb_ke_pc[FIXED_ITER_COUNT][48];
+  #else
+  GLOBAL_AS u32 (*s_wb_ke_pc)[48] = NULL;
+  #endif
+
+  #ifdef REAL_SHM
+
+  for (u32 i = lid; i < FIXED_ITER_COUNT; i += lsz)
+  {
+    for (u32 j = 0; j < 48; j++) // first 16 set to register
+    {
+      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + i][j];
+    }
+  }
+
+  SYNC_THREADS ();
+
+  #else
+
+  s_wb_ke_pc = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos];
+
+  #endif
 
   // main loop
 
-  for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++)
+  for (u32 i = 0, j = loop_pos; i < FIXED_ITER_COUNT; i++, j++)
   {
     u32x digest[8];
 
@@ -305,6 +315,60 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
     w1[3] = digest[7];
   }
 
+  #ifdef IS_CUDA
+  // nothing to do
+  #else
+  // remaining 128 iterations for non-CUDA devices
+  #ifdef REAL_SHM
+
+  for (u32 i = lid; i < FIXED_ITER_COUNT; i += lsz)
+  {
+    for (u32 j = 0; j < 48; j++) // first 16 set to register
+    {
+      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + 128 + i][j];
+    }
+  }
+
+  SYNC_THREADS ();
+
+  #else
+
+  s_wb_ke_pc = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos + 128];
+
+  #endif
+
+  // main loop
+
+  for (u32 i = 0, j = loop_pos + 128; i < FIXED_ITER_COUNT; i++, j++)
+  {
+    u32x digest[8];
+
+    digest[0] = SHA256M_A;
+    digest[1] = SHA256M_B;
+    digest[2] = SHA256M_C;
+    digest[3] = SHA256M_D;
+    digest[4] = SHA256M_E;
+    digest[5] = SHA256M_F;
+    digest[6] = SHA256M_G;
+    digest[7] = SHA256M_H;
+
+    sha256_transform_vector (w0, w1, w2, w3, digest);
+
+    t1[0] = hc_swap32_S (j); // only moving part
+
+    sha256_transform_vector_pc (t0, t1, t2, t3, digest, s_wb_ke_pc[i]);
+
+    w0[0] = digest[0];
+    w0[1] = digest[1];
+    w0[2] = digest[2];
+    w0[3] = digest[3];
+    w1[0] = digest[4];
+    w1[1] = digest[5];
+    w1[2] = digest[6];
+    w1[3] = digest[7];
+  }
+  #endif
+
   unpackv (tmps, last_hash, gid, 0, w0[0]);
   unpackv (tmps, last_hash, gid, 1, w0[1]);
   unpackv (tmps, last_hash, gid, 2, w0[2]);
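Note for reviewers (not part of the patch): a quick sanity check of the shared-memory sizing behind the CUDA/OpenCL split above. The only inputs are taken from the kernel itself (48 u32 words per precomputed wb_ke_pc[] entry, 256 vs. 128 resident entries) and from the ptxas message quoted in the new comment (0xc004 bytes requested against a 0xc000-byte limit). The small C program below is just an illustration of that arithmetic, not part of hashcat.

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  const size_t entry_words = 48;                              // u32 words per wb_ke_pc[] entry
  const size_t entry_bytes = entry_words * sizeof (uint32_t); // 192 bytes per entry

  const size_t full_table  = 256 * entry_bytes;               // CUDA path: whole table resident
  const size_t half_table  = 128 * entry_bytes;               // OpenCL path: FIXED_ITER_COUNT = 128

  const size_t ptxas_limit = 0xc000;                          // limit quoted in the ptxas error

  printf ("256 entries: %zu bytes (0x%zx)\n", full_table, full_table); // 49152 (0xc000)
  printf ("128 entries: %zu bytes (0x%zx)\n", half_table, half_table); // 24576 (0x6000)
  printf ("ptxas limit: %zu bytes (0x%zx)\n", ptxas_limit, ptxas_limit);

  // The 256-entry table alone already fills the 0xc000-byte budget, so the extra
  // 4 bytes of shared data reported by ptxas (0xc004) push the NV OpenCL build
  // over the limit. Halving the resident table to 128 entries leaves ample room,
  // at the cost of loading the precomputed buffers and running the SHA-256 loop
  // in two batches of 128 iterations, which is what the last hunk adds.
  return 0;
}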