From ce8c121b5000ccc94357a4e052d6330b50fb44d8 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 1 Jun 2021 22:52:07 +0200 Subject: [PATCH] BCRYPT Kernels: Improved bcrypt performance by 6.5% for high-end NVIDIA GPU devices using CUDA backend --- OpenCL/m03200-pure.cl | 195 ++++++++++++++++++++++++------------- OpenCL/m25600-pure.cl | 195 ++++++++++++++++++++++++------------- OpenCL/m25800-pure.cl | 195 ++++++++++++++++++++++++------------- src/backend.c | 14 +-- src/modules/module_03200.c | 9 +- src/modules/module_25600.c | 9 +- src/modules/module_25800.c | 9 +- 7 files changed, 388 insertions(+), 238 deletions(-) diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl index 98c54441a..d58beee27 100644 --- a/OpenCL/m03200-pure.cl +++ b/OpenCL/m03200-pure.cl @@ -309,6 +309,51 @@ CONSTANT_VK u32a c_pbox[18] = 0x9216d5d9, 0x8979fb1b }; +// Yes, works only with CUDA atm + +#ifdef DYNAMIC_LOCAL +#define BCRYPT_AVOID_BANK_CONFLICTS +#endif + +#ifdef BCRYPT_AVOID_BANK_CONFLICTS + +// access pattern: minimize bank ID based on thread ID but thread ID is not saved from computation + +#define KEY32(lid,key) (((key) * FIXED_LOCAL_SIZE) + (lid)) + +DECLSPEC u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + const u64 lid = get_local_id (0); + + return S[KEY32 (lid, key)]; +} + +DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + const u64 lid = get_local_id (0); + + S[KEY32 (lid, key)] = val; +} + +#undef KEY32 + +#else + +// access pattern: linear access with S offset already set to right offset based on thread ID saving it from compuation +// makes sense if there are not thread ID's (for instance on CPU) + +DECLSPEC inline u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + return S[key]; +} + +DECLSPEC inline void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + S[key] = val; +} + +#endif + #define BF_ROUND(L,R,N) \ { \ u32 tmp; \ @@ -318,10 +363,10 @@ CONSTANT_VK u32a c_pbox[18] = const u32 r2 = unpack_v8b_from_v32_S ((L)); \ const u32 r3 = unpack_v8a_from_v32_S ((L)); \ \ - tmp = S0[r0]; \ - tmp += S1[r1]; \ - tmp ^= S2[r2]; \ - tmp += S3[r3]; \ + tmp = GET_KEY32 (S0, r0); \ + tmp += GET_KEY32 (S1, r1); \ + tmp ^= GET_KEY32 (S2, r2); \ + tmp += GET_KEY32 (S3, r3); \ \ (R) ^= tmp ^ P[(N)]; \ } @@ -357,7 +402,7 @@ CONSTANT_VK u32a c_pbox[18] = } #ifdef DYNAMIC_LOCAL -extern __shared__ u32 lm[]; +extern __shared__ u32 S[]; #endif DECLSPEC void expand_key (u32 *E, u32 *W, const int len) @@ -461,16 +506,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -479,10 +528,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = c_sbox0[i]; - S1[i] = c_sbox1[i]; - S2[i] = c_sbox2[i]; - S3[i] = c_sbox3[i]; + SET_KEY32 (S0, i, c_sbox0[i]); + SET_KEY32 (S1, i, c_sbox1[i]); + SET_KEY32 (S2, i, c_sbox2[i]); + SET_KEY32 (S3, i, c_sbox3[i]); } // expandstate @@ -513,16 +562,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S0[i + 2] = L0; - S0[i + 3] = R0; + SET_KEY32 (S0, i + 2, L0); + SET_KEY32 (S0, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -532,16 +581,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S1[i + 2] = L0; - S1[i + 3] = R0; + SET_KEY32 (S1, i + 2, L0); + SET_KEY32 (S1, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -551,16 +600,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S2[i + 2] = L0; - S2[i + 3] = R0; + SET_KEY32 (S2, i + 2, L0); + SET_KEY32 (S2, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -570,16 +619,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S3[i + 2] = L0; - S3[i + 3] = R0; + SET_KEY32 (S3, i + 2, L0); + SET_KEY32 (S3, i + 3, R0); } // store @@ -591,10 +640,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - tmps[gid].S0[i] = S0[i]; - tmps[gid].S1[i] = S1[i]; - tmps[gid].S2[i] = S2[i]; - tmps[gid].S3[i] = S3[i]; + tmps[gid].S0[i] = GET_KEY32 (S0, i); + tmps[gid].S1[i] = GET_KEY32 (S1, i); + tmps[gid].S2[i] = GET_KEY32 (S2, i); + tmps[gid].S3[i] = GET_KEY32 (S3, i); } } @@ -626,16 +675,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -644,10 +697,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = tmps[gid].S0[i]; - S1[i] = tmps[gid].S1[i]; - S2[i] = tmps[gid].S2[i]; - S3[i] = tmps[gid].S3[i]; + SET_KEY32 (S0, i, tmps[gid].S0[i]); + SET_KEY32 (S1, i, tmps[gid].S1[i]); + SET_KEY32 (S2, i, tmps[gid].S2[i]); + SET_KEY32 (S3, i, tmps[gid].S3[i]); } /** @@ -690,32 +743,32 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS { BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); } P[ 0] ^= salt_buf[0]; @@ -752,32 +805,32 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS { BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); } } @@ -790,10 +843,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - tmps[gid].S0[i] = S0[i]; - tmps[gid].S1[i] = S1[i]; - tmps[gid].S2[i] = S2[i]; - tmps[gid].S3[i] = S3[i]; + tmps[gid].S0[i] = GET_KEY32 (S0, i); + tmps[gid].S1[i] = GET_KEY32 (S1, i); + tmps[gid].S2[i] = GET_KEY32 (S2, i); + tmps[gid].S3[i] = GET_KEY32 (S3, i); } } @@ -818,16 +871,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -836,10 +893,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = tmps[gid].S0[i]; - S1[i] = tmps[gid].S1[i]; - S2[i] = tmps[gid].S2[i]; - S3[i] = tmps[gid].S3[i]; + SET_KEY32 (S0, i, tmps[gid].S0[i]); + SET_KEY32 (S1, i, tmps[gid].S1[i]); + SET_KEY32 (S2, i, tmps[gid].S2[i]); + SET_KEY32 (S3, i, tmps[gid].S3[i]); } /** diff --git a/OpenCL/m25600-pure.cl b/OpenCL/m25600-pure.cl index e490679a7..9c03680e3 100644 --- a/OpenCL/m25600-pure.cl +++ b/OpenCL/m25600-pure.cl @@ -322,6 +322,51 @@ CONSTANT_VK u32a c_pbox[18] = 0x9216d5d9, 0x8979fb1b }; +// Yes, works only with CUDA atm + +#ifdef DYNAMIC_LOCAL +#define BCRYPT_AVOID_BANK_CONFLICTS +#endif + +#ifdef BCRYPT_AVOID_BANK_CONFLICTS + +// access pattern: minimize bank ID based on thread ID but thread ID is not saved from computation + +#define KEY32(lid,key) (((key) * FIXED_LOCAL_SIZE) + (lid)) + +DECLSPEC u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + const u64 lid = get_local_id (0); + + return S[KEY32 (lid, key)]; +} + +DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + const u64 lid = get_local_id (0); + + S[KEY32 (lid, key)] = val; +} + +#undef KEY32 + +#else + +// access pattern: linear access with S offset already set to right offset based on thread ID saving it from compuation +// makes sense if there are not thread ID's (for instance on CPU) + +DECLSPEC inline u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + return S[key]; +} + +DECLSPEC inline void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + S[key] = val; +} + +#endif + #define BF_ROUND(L,R,N) \ { \ u32 tmp; \ @@ -331,10 +376,10 @@ CONSTANT_VK u32a c_pbox[18] = const u32 r2 = unpack_v8b_from_v32_S ((L)); \ const u32 r3 = unpack_v8a_from_v32_S ((L)); \ \ - tmp = S0[r0]; \ - tmp += S1[r1]; \ - tmp ^= S2[r2]; \ - tmp += S3[r3]; \ + tmp = GET_KEY32 (S0, r0); \ + tmp += GET_KEY32 (S1, r1); \ + tmp ^= GET_KEY32 (S2, r2); \ + tmp += GET_KEY32 (S3, r3); \ \ (R) ^= tmp ^ P[(N)]; \ } @@ -370,7 +415,7 @@ CONSTANT_VK u32a c_pbox[18] = } #ifdef DYNAMIC_LOCAL -extern __shared__ u32 lm[]; +extern __shared__ u32 S[]; #endif DECLSPEC void expand_key (u32 *E, u32 *W, const int len) @@ -537,16 +582,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_init (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -555,10 +604,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_init (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = c_sbox0[i]; - S1[i] = c_sbox1[i]; - S2[i] = c_sbox2[i]; - S3[i] = c_sbox3[i]; + SET_KEY32 (S0, i, c_sbox0[i]); + SET_KEY32 (S1, i, c_sbox1[i]); + SET_KEY32 (S2, i, c_sbox2[i]); + SET_KEY32 (S3, i, c_sbox3[i]); } // expandstate @@ -589,16 +638,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S0[i + 2] = L0; - S0[i + 3] = R0; + SET_KEY32 (S0, i + 2, L0); + SET_KEY32 (S0, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -608,16 +657,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S1[i + 2] = L0; - S1[i + 3] = R0; + SET_KEY32 (S1, i + 2, L0); + SET_KEY32 (S1, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -627,16 +676,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S2[i + 2] = L0; - S2[i + 3] = R0; + SET_KEY32 (S2, i + 2, L0); + SET_KEY32 (S2, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -646,16 +695,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S3[i + 2] = L0; - S3[i + 3] = R0; + SET_KEY32 (S3, i + 2, L0); + SET_KEY32 (S3, i + 3, R0); } // store @@ -667,10 +716,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_init (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - tmps[gid].S0[i] = S0[i]; - tmps[gid].S1[i] = S1[i]; - tmps[gid].S2[i] = S2[i]; - tmps[gid].S3[i] = S3[i]; + tmps[gid].S0[i] = GET_KEY32 (S0, i); + tmps[gid].S1[i] = GET_KEY32 (S1, i); + tmps[gid].S2[i] = GET_KEY32 (S2, i); + tmps[gid].S3[i] = GET_KEY32 (S3, i); } } @@ -702,16 +751,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_loop (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -720,10 +773,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_loop (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = tmps[gid].S0[i]; - S1[i] = tmps[gid].S1[i]; - S2[i] = tmps[gid].S2[i]; - S3[i] = tmps[gid].S3[i]; + SET_KEY32 (S0, i, tmps[gid].S0[i]); + SET_KEY32 (S1, i, tmps[gid].S1[i]); + SET_KEY32 (S2, i, tmps[gid].S2[i]); + SET_KEY32 (S3, i, tmps[gid].S3[i]); } /** @@ -766,32 +819,32 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_loop (KERN_ATTR_TMPS { BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); } P[ 0] ^= salt_buf[0]; @@ -828,32 +881,32 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_loop (KERN_ATTR_TMPS { BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); } } @@ -866,10 +919,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_loop (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - tmps[gid].S0[i] = S0[i]; - tmps[gid].S1[i] = S1[i]; - tmps[gid].S2[i] = S2[i]; - tmps[gid].S3[i] = S3[i]; + tmps[gid].S0[i] = GET_KEY32 (S0, i); + tmps[gid].S1[i] = GET_KEY32 (S1, i); + tmps[gid].S2[i] = GET_KEY32 (S2, i); + tmps[gid].S3[i] = GET_KEY32 (S3, i); } } @@ -894,16 +947,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_comp (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -912,10 +969,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25600_comp (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = tmps[gid].S0[i]; - S1[i] = tmps[gid].S1[i]; - S2[i] = tmps[gid].S2[i]; - S3[i] = tmps[gid].S3[i]; + SET_KEY32 (S0, i, tmps[gid].S0[i]); + SET_KEY32 (S1, i, tmps[gid].S1[i]); + SET_KEY32 (S2, i, tmps[gid].S2[i]); + SET_KEY32 (S3, i, tmps[gid].S3[i]); } /** diff --git a/OpenCL/m25800-pure.cl b/OpenCL/m25800-pure.cl index 4205d12a0..2567986b5 100644 --- a/OpenCL/m25800-pure.cl +++ b/OpenCL/m25800-pure.cl @@ -322,6 +322,51 @@ CONSTANT_VK u32a c_pbox[18] = 0x9216d5d9, 0x8979fb1b }; +// Yes, works only with CUDA atm + +#ifdef DYNAMIC_LOCAL +#define BCRYPT_AVOID_BANK_CONFLICTS +#endif + +#ifdef BCRYPT_AVOID_BANK_CONFLICTS + +// access pattern: minimize bank ID based on thread ID but thread ID is not saved from computation + +#define KEY32(lid,key) (((key) * FIXED_LOCAL_SIZE) + (lid)) + +DECLSPEC u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + const u64 lid = get_local_id (0); + + return S[KEY32 (lid, key)]; +} + +DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + const u64 lid = get_local_id (0); + + S[KEY32 (lid, key)] = val; +} + +#undef KEY32 + +#else + +// access pattern: linear access with S offset already set to right offset based on thread ID saving it from compuation +// makes sense if there are not thread ID's (for instance on CPU) + +DECLSPEC inline u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key) +{ + return S[key]; +} + +DECLSPEC inline void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val) +{ + S[key] = val; +} + +#endif + #define BF_ROUND(L,R,N) \ { \ u32 tmp; \ @@ -331,10 +376,10 @@ CONSTANT_VK u32a c_pbox[18] = const u32 r2 = unpack_v8b_from_v32_S ((L)); \ const u32 r3 = unpack_v8a_from_v32_S ((L)); \ \ - tmp = S0[r0]; \ - tmp += S1[r1]; \ - tmp ^= S2[r2]; \ - tmp += S3[r3]; \ + tmp = GET_KEY32 (S0, r0); \ + tmp += GET_KEY32 (S1, r1); \ + tmp ^= GET_KEY32 (S2, r2); \ + tmp += GET_KEY32 (S3, r3); \ \ (R) ^= tmp ^ P[(N)]; \ } @@ -370,7 +415,7 @@ CONSTANT_VK u32a c_pbox[18] = } #ifdef DYNAMIC_LOCAL -extern __shared__ u32 lm[]; +extern __shared__ u32 S[]; #endif DECLSPEC void expand_key (u32 *E, u32 *W, const int len) @@ -540,16 +585,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_init (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -558,10 +607,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_init (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = c_sbox0[i]; - S1[i] = c_sbox1[i]; - S2[i] = c_sbox2[i]; - S3[i] = c_sbox3[i]; + SET_KEY32 (S0, i, c_sbox0[i]); + SET_KEY32 (S1, i, c_sbox1[i]); + SET_KEY32 (S2, i, c_sbox2[i]); + SET_KEY32 (S3, i, c_sbox3[i]); } // expandstate @@ -592,16 +641,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S0[i + 2] = L0; - S0[i + 3] = R0; + SET_KEY32 (S0, i + 2, L0); + SET_KEY32 (S0, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -611,16 +660,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S1[i + 2] = L0; - S1[i + 3] = R0; + SET_KEY32 (S1, i + 2, L0); + SET_KEY32 (S1, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -630,16 +679,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S2[i + 2] = L0; - S2[i + 3] = R0; + SET_KEY32 (S2, i + 2, L0); + SET_KEY32 (S2, i + 3, R0); } for (u32 i = 0; i < 256; i += 4) @@ -649,16 +698,16 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_init (KERN_ATTR_TMPS BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); L0 ^= salt_buf[0]; R0 ^= salt_buf[1]; BF_ENCRYPT (L0, R0); - S3[i + 2] = L0; - S3[i + 3] = R0; + SET_KEY32 (S3, i + 2, L0); + SET_KEY32 (S3, i + 3, R0); } // store @@ -670,10 +719,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_init (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - tmps[gid].S0[i] = S0[i]; - tmps[gid].S1[i] = S1[i]; - tmps[gid].S2[i] = S2[i]; - tmps[gid].S3[i] = S3[i]; + tmps[gid].S0[i] = GET_KEY32 (S0, i); + tmps[gid].S1[i] = GET_KEY32 (S1, i); + tmps[gid].S2[i] = GET_KEY32 (S2, i); + tmps[gid].S3[i] = GET_KEY32 (S3, i); } } @@ -705,16 +754,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_loop (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -723,10 +776,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_loop (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = tmps[gid].S0[i]; - S1[i] = tmps[gid].S1[i]; - S2[i] = tmps[gid].S2[i]; - S3[i] = tmps[gid].S3[i]; + SET_KEY32 (S0, i, tmps[gid].S0[i]); + SET_KEY32 (S1, i, tmps[gid].S1[i]); + SET_KEY32 (S2, i, tmps[gid].S2[i]); + SET_KEY32 (S3, i, tmps[gid].S3[i]); } /** @@ -769,32 +822,32 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_loop (KERN_ATTR_TMPS { BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); } P[ 0] ^= salt_buf[0]; @@ -831,32 +884,32 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_loop (KERN_ATTR_TMPS { BF_ENCRYPT (L0, R0); - S0[i + 0] = L0; - S0[i + 1] = R0; + SET_KEY32 (S0, i + 0, L0); + SET_KEY32 (S0, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S1[i + 0] = L0; - S1[i + 1] = R0; + SET_KEY32 (S1, i + 0, L0); + SET_KEY32 (S1, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S2[i + 0] = L0; - S2[i + 1] = R0; + SET_KEY32 (S2, i + 0, L0); + SET_KEY32 (S2, i + 1, R0); } for (u32 i = 0; i < 256; i += 2) { BF_ENCRYPT (L0, R0); - S3[i + 0] = L0; - S3[i + 1] = R0; + SET_KEY32 (S3, i + 0, L0); + SET_KEY32 (S3, i + 1, R0); } } @@ -869,10 +922,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_loop (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - tmps[gid].S0[i] = S0[i]; - tmps[gid].S1[i] = S1[i]; - tmps[gid].S2[i] = S2[i]; - tmps[gid].S3[i] = S3[i]; + tmps[gid].S0[i] = GET_KEY32 (S0, i); + tmps[gid].S1[i] = GET_KEY32 (S1, i); + tmps[gid].S2[i] = GET_KEY32 (S2, i); + tmps[gid].S3[i] = GET_KEY32 (S3, i); } } @@ -897,16 +950,20 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_comp (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; - LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; - LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; - LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; + // from host #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256]; + #endif + #ifdef BCRYPT_AVOID_BANK_CONFLICTS + LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0); + LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1); + LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2); + LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3); + #else LOCAL_AS u32 *S0 = S0_all[lid]; LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; @@ -915,10 +972,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m25800_comp (KERN_ATTR_TMPS for (u32 i = 0; i < 256; i++) { - S0[i] = tmps[gid].S0[i]; - S1[i] = tmps[gid].S1[i]; - S2[i] = tmps[gid].S2[i]; - S3[i] = tmps[gid].S3[i]; + SET_KEY32 (S0, i, tmps[gid].S0[i]); + SET_KEY32 (S1, i, tmps[gid].S1[i]); + SET_KEY32 (S2, i, tmps[gid].S2[i]); + SET_KEY32 (S3, i, tmps[gid].S3[i]); } /** diff --git a/src/backend.c b/src/backend.c index cce9d70a3..a7593ccf2 100644 --- a/src/backend.c +++ b/src/backend.c @@ -3685,17 +3685,17 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con dynamic_shared_mem = 0; } - if (device_param->is_cuda == true) - { - if ((device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) == 0) - { + //if (device_param->is_cuda == true) + //{ + //if ((device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) == 0) + //{ // this is the case Compute Capability 7.5 // there is also Compute Capability 7.0 which offers a larger dynamic local size access // however, if it's an exact multiple the driver can optimize this for us more efficient - dynamic_shared_mem = 0; - } - } + //dynamic_shared_mem = 0; + //} + //} kernel_threads = MIN (kernel_threads, device_param->kernel_threads); diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c index 73e92f304..458526db6 100644 --- a/src/modules/module_03200.c +++ b/src/modules/module_03200.c @@ -102,14 +102,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY if (device_param->is_cuda == true) { - if (device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) - { - // this is the case Compute Capability 7.5 - // there is also Compute Capability 7.0 which offers a larger dynamic local size access - // however, if it's an exact multiple the driver can optimize this for us more efficient - - use_dynamic = true; - } + use_dynamic = true; } // this uses some nice feedback effect. diff --git a/src/modules/module_25600.c b/src/modules/module_25600.c index 2615640a6..561c04d77 100644 --- a/src/modules/module_25600.c +++ b/src/modules/module_25600.c @@ -83,14 +83,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY if (device_param->is_cuda == true) { - if (device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) - { - // this is the case Compute Capability 7.5 - // there is also Compute Capability 7.0 which offers a larger dynamic local size access - // however, if it's an exact multiple the driver can optimize this for us more efficient - - use_dynamic = true; - } + use_dynamic = true; } // this uses some nice feedback effect. diff --git a/src/modules/module_25800.c b/src/modules/module_25800.c index 444f6595e..154e9bc5c 100644 --- a/src/modules/module_25800.c +++ b/src/modules/module_25800.c @@ -83,14 +83,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY if (device_param->is_cuda == true) { - if (device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) - { - // this is the case Compute Capability 7.5 - // there is also Compute Capability 7.0 which offers a larger dynamic local size access - // however, if it's an exact multiple the driver can optimize this for us more efficient - - use_dynamic = true; - } + use_dynamic = true; } // this uses some nice feedback effect.