From 7f419c68aff2006376b0f2bc6225eededc62d772 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 26 Jul 2021 13:38:39 +0200 Subject: [PATCH] Blowfish Kernels: Backport optimizations reducing bank conflicts from bcrypt to Password Safe v2 and Open Document Format (ODF) 1.1 --- OpenCL/m18600-pure.cl | 118 ++++++++++++++++++++++++++++--------- docs/changes.txt | 2 +- src/modules/module_18600.c | 63 ++++++++++++++++---- 3 files changed, 141 insertions(+), 42 deletions(-) diff --git a/OpenCL/m18600-pure.cl b/OpenCL/m18600-pure.cl index 061c61bd8..f98aca9c0 100644 --- a/OpenCL/m18600-pure.cl +++ b/OpenCL/m18600-pure.cl @@ -319,6 +319,51 @@ CONSTANT_VK u32a c_pbox[18] = 0x9216d5d9, 0x8979fb1b }; +// Yes, works only with CUDA atm + +#ifdef DYNAMIC_LOCAL +#define BCRYPT_AVOID_BANK_CONFLICTS +#endif + +#ifdef BCRYPT_AVOID_BANK_CONFLICTS + +// access pattern: minimize bank ID based on thread ID but thread ID is not saved from computation + +#define KEY32(lid,key) (((key) * FIXED_LOCAL_SIZE_COMP) + (lid)) + +DECLSPEC u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + const u64 lid = get_local_id (0); + + return S[KEY32 (lid, key)]; +} + +DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + const u64 lid = get_local_id (0); + + S[KEY32 (lid, key)] = val; +} + +#undef KEY32 + +#else + +// access pattern: linear access with S offset already set to right offset based on thread ID saving it from compuation +// makes sense if there are not thread ID's (for instance on CPU) + +DECLSPEC inline u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + return S[key]; +} + +DECLSPEC inline void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + S[key] = val; +} + +#endif + #define BF_ROUND(L,R,N) \ { \ u32 tmp; \ @@ -328,10 +373,10 @@ CONSTANT_VK u32a c_pbox[18] = const u32 r2 = unpack_v8b_from_v32_S ((L)); \ const u32 r3 = unpack_v8a_from_v32_S ((L)); \ \ - tmp = S0[r0]; \ - tmp += S1[r1]; \ - tmp ^= S2[r2]; \ - tmp += S3[r3]; \ + tmp = GET_KEY32 (S0, r0); \ + tmp += GET_KEY32 (S1, r1); \ + tmp ^= GET_KEY32 (S2, r2); \ + tmp += GET_KEY32 (S3, r3); \ \ (R) ^= tmp ^ P[(N)]; \ } @@ -366,6 +411,10 @@ CONSTANT_VK u32a c_pbox[18] = L ^= P[17]; \ } +#ifdef DYNAMIC_LOCAL +extern __shared__ u32 S[]; +#endif + DECLSPEC void hmac_sha1_run_V (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *ipad, u32x *opad, u32x *digest) { digest[0] = ipad[0]; @@ -586,7 +635,7 @@ KERNEL_FQ void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t)) } } -KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t)) +KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t)) { const u64 gid = get_global_id (0); const u64 lid = get_local_id (0); @@ -616,22 +665,33 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_ P[i] = c_pbox[i] ^ ukey[i % 4]; } - LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; - LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; - LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; - LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #ifdef DYNAMIC_LOCAL + // from host + #else + LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE_COMP][256]; + LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE_COMP][256]; + LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE_COMP][256]; + LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE_COMP][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; LOCAL_AS u32 *S3 = S3_all[lid]; + #endif for (u32 i = 0; i < 256; i++) { - S0[i] = c_sbox0[i]; - S1[i] = c_sbox1[i]; - S2[i] = c_sbox2[i]; - S3[i] = c_sbox3[i]; + SET_KEY32 (S0, i, c_sbox0[i]); + SET_KEY32 (S1, i, c_sbox1[i]); + SET_KEY32 (S2, i, c_sbox2[i]); + SET_KEY32 (S3, i, c_sbox3[i]); } u32 L0 = 0; @@ -649,52 +709,52 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_ { BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); BF_ENCRYPT (L0, R0); - S0[i + 2] = L0; - S0[i + 3] = R0; + SET_KEY32 (S0, i + 2, L0); + SET_KEY32 (S0, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) { BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); BF_ENCRYPT (L0, R0); - S1[i + 2] = L0; - S1[i + 3] = R0; + SET_KEY32 (S1, i + 2, L0); + SET_KEY32 (S1, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) { BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); BF_ENCRYPT (L0, R0); - S2[i + 2] = L0; - S2[i + 3] = R0; + SET_KEY32 (S2, i + 2, L0); + SET_KEY32 (S2, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) { BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); BF_ENCRYPT (L0, R0); - S3[i + 2] = L0; - S3[i + 3] = R0; + SET_KEY32 (S3, i + 2, L0); + SET_KEY32 (S3, i + 3, R0); } GLOBAL_AS const odf11_t *es = &esalt_bufs[DIGESTS_OFFSET]; diff --git a/docs/changes.txt b/docs/changes.txt index 69c6c1ad0..cc14c475e 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -20,9 +20,9 @@ - AMD GPUs: Add inline assembly code for md5crypt/sha256crypt, PDF 1.7, 7-Zip, RAR3, Samsung Android and Windows Phone 8+ - Blake Kernels: Optimize BLAKE2B_ROUND() 64 bit rotates giving a 5% performance increase +- Blowfish Kernels: Backport optimizations reducing bank conflicts from bcrypt to Password Safe v2 and Open Document Format (ODF) 1.1 - Brain Session: Adds hashconfig specific opti_type and opts_type parameters to hashcat session computation to cover features like -O and -M - Kernel Threads: Use warp size / wavefront size query instead of hardcoded values as base for kernel threads -- Password Safe v2: Backport optimizations reducing bank conflicts in bcrypt - Shared Memory: Calculate kernel dynamic memory size based on CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN - Slow Kernels: Set some of the slowest kernels to OPTS_TYPE_MP_MULTI_DISABLE diff --git a/src/modules/module_18600.c b/src/modules/module_18600.c index c24ea37fe..8f2d05f4c 100644 --- a/src/modules/module_18600.c +++ b/src/modules/module_18600.c @@ -21,7 +21,8 @@ static const char *HASH_NAME = "Open Document Format (ODF) 1.1 (SHA-1, Blow static const u64 KERN_TYPE = 18600; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; -static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE; +static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_DYNAMIC_SHARED; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; static const char *ST_PASS = "hashcat"; static const char *ST_HASH = "$odf$*0*0*1024*16*bff753835f4ea15644b8a2f8e4b5be3d147b9576*8*ee371da34333b69d*16*a902eff54a4d782a26a899a31f97bef4*0*dae7e41fbc3a500d3ce152edd8876c4f38fb17d673ee2ac44ef1e0e283622cd2ae298a82d8d98f2ea737247881fc353e73a2f535c6e13e0cdc60821c1a61c53a4b0c46ff3a3b355d7b793fad50de15999fc7c1194321d1c54316c3806956c4a3ade7daabb912a2a36398eba883af088b3cb69b43365d9ba9fce3fb0c1524f73947a7e9fc1bf3adb5f85a367035feacb5d97c578b037144c2793f34aa09dcd04bdaa455aee0d4c52fe377248611dd56f2bd4eb294673525db905f5d905a28dec0909348e6bf94bcebf03ddd61a48797cd5728ce6dbb71037b268f526e806401abcf495f6edd0b5d87118671ec690d4627f86a43e51c7f6d42a75a56eec51204d47e115e813ed4425c97b16b195e02ce776c185194b9de43ae89f356e29face016cb393d6fb93af8ea305d921d5592dd184051ac790b9b90266f52b8d53ce1cb1d762942d6d5bbd0e3821be21af9fa6874ba0c60e64f41d3e5b6caca1c53b575afdc5d8f6a3edbf874dbe009c6cb296466fe9637aed4aed8a43a95ea7d26b4090ad33d4ee7a83844b0893e8bc0f04944205fb9576cb5720f019028cd75ca9ac47b3e5fa231354d74135564df43b659cfaea7e195c4a896e0e0e0c85dc9ce3a9ce9ba552bc2a6dbac4901c19558818e1957ed72d78662bb5ba53475ca584371f1825ae0c92322a4404e63c2baad92665aac29b5c6f96e1e6338d48fb0aef4d0b686063974f58b839484f8dcf0a02537cba67a7d2c4de13125d74820cb07ec72782035af1ea6c4db61c77016d1c021b63c8b07adb4e8510f5c41bbc501f60f3dd16462399b52eb146787e38e700147c7aa23ac4d5d22d9d1c93e67a01c92a197d4765cbf8d56a862a1205abb450a182913a69b8d5334a59924f86fb3ccd0dcfe7426053e26ba26b57c05f38d85863fff1f81135b0366e8cd8680663ae8aaf7d005317b849d5e08be882708fa0d8d02d47e89150124b507c34845c922b95e62aa0b3fef218773d7aeb572c67b35ad8787f31ecc6e1846b673b8ba6172223176eabf0020b6aa3aa71405b40b2fc2127bf9741a103f1d8eca21bf27328cdf15153f2f223eff7b831a72ed8ecacf4ea8df4ea44f3a3921e5a88fb2cfa355ece0f05cbc88fdd1ecd368d6e3b2dfabd999e5b708f1bccaeebb296c9d7b76659967742fe966aa6871cbbffe710b0cd838c6e02e6eb608cb5c81d066b60b5b3604396331d97d4a2c4c2317406e48c9f5387a2c72511d1e6899bd450e9ca88d535755bcfddb53a6df118cd9cdc7d8b4b814f7bc17684d8e5975defaa25d06f410ed0724c16b8f69ec3869bc1f05c71483666968d1c04509875dadd72c6182733d564eb1a7d555dc34f6b817c5418626214d0b2c3901c5a46f5b20fddfdf9f71a7dfd75b9928778a3f65e1832dff22be973c2b259744d500a3027c2a2e08972eaaad4c5c4ec871"; @@ -66,16 +67,25 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY { char *jit_build_options = NULL; + // this mode heavily depends on the available shared memory size + // note the kernel need to have some special code changes in order to make use to use post-48k memory region + // we need to set some macros + + bool use_dynamic = false; + + if (device_param->is_cuda == true) + { + use_dynamic = true; + } + // this uses some nice feedback effect. // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value // which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result. // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1. - u32 fixed_local_size = 0; - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) { - fixed_local_size = 1; + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE_COMP=%u", 1); } else { @@ -91,29 +101,58 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY if (device_param->is_opencl == true) { - overhead = 4; + overhead = 1; } } if (user_options->kernel_threads_chgd == true) { - fixed_local_size = user_options->kernel_threads; + u32 fixed_local_size = user_options->kernel_threads; + + if (use_dynamic == true) + { + if ((fixed_local_size * 4096) > device_param->kernel_dynamic_local_mem_size_memset) + { + // otherwise out-of-bound reads - // otherwise out-of-bound reads + fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096; + } - if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead)) + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE_COMP=%u -D DYNAMIC_LOCAL", fixed_local_size); + } + else { - fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead)) + { + // otherwise out-of-bound reads + + fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + } + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE_COMP=%u", fixed_local_size); } } else { - fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + if (use_dynamic == true) + { + // using kernel_dynamic_local_mem_size_memset is a bit hackish. + // we had to brute-force this value out of an already loaded CUDA function. + // there's no official way to query for this value. + + const u32 fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096; + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE_COMP=%u -D DYNAMIC_LOCAL", fixed_local_size); + } + else + { + const u32 fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE_COMP=%u", fixed_local_size); + } } } - hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); - return jit_build_options; }