From 24094793dabe3ede1d3cb0bdfc787ce80746afae Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 18 Mar 2020 16:13:57 +0100 Subject: [PATCH] Workaround for -m 22100 on NVIDIA --- OpenCL/m22100-pure.cl | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/OpenCL/m22100-pure.cl b/OpenCL/m22100-pure.cl index af98a712a..3a5f60b84 100644 --- a/OpenCL/m22100-pure.cl +++ b/OpenCL/m22100-pure.cl @@ -265,11 +265,35 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t)) { #ifdef REAL_SHM - for (u32 i = lid; i < FIXED_ITER_INCR; i += lsz) + /**
+ * On NVIDIA, the __syncthreads() is not working as expected if called from inside a loop.
+ * This is a unique situation across all hashcat kernels so far.
+ * From CUDA manual:
+ * __syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block,
+ * otherwise the code execution is likely to hang or produce unintended side effects.
+ * NVIDIA OpenCL runtime is also affected, but other OpenCL runtimes work as they should.
+ * A workaround exists by disabling shared memory access. Speed drop is around 4%.
+ * Another workaround is to let only a single thread do all the work while all other threads wait for it to finish. Speed drop is around 0.05%.
+
+ // original code
+ for (int i = lid; i < FIXED_ITER_INCR; i += lsz)
+ {
+ for (int j = 0; j < 48; j++) // first 16 set to register
+ {
+ s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + t + i][j];
+ }
+ }
+
+ */
+
+ if (lid == 0) { - for (u32 j = 0; j < 48; j++) // first 16 set to register + for (int i = 0; i < FIXED_ITER_INCR; i++) { - s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + t + i][j]; + for (int j = 0; j < 48; j++) // first 16 set to register + { + s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + t + i][j]; + } } }