No longer need to use 32 threads on second dimension for bitsliced algorithms

2025-07-04 05:42:35 +00:00 · 2018-02-20 01:01:50 +01:00 · 2018-02-20 01:01:50 +01:00 · ca1115a1ee
commit ca1115a1ee
parent ad50883080
5 changed files with 21 additions and 31 deletions
--- a/OpenCL/m01500_a3.cl
+++ b/OpenCL/m01500_a3.cl
@@ -1664,18 +1664,18 @@ DECLSPEC void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32

 DECLSPEC void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63)
 {
-  sXXX_DECL u32 s001 = (0x001 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s002 = (0x002 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s004 = (0x004 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s008 = (0x008 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s010 = (0x010 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s020 = (0x020 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s040 = (0x040 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s080 = (0x080 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s100 = (0x100 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s200 = (0x200 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s400 = (0x400 & SALT) ? 0xffffffff : 0;
-  sXXX_DECL u32 s800 = (0x800 & SALT) ? 0xffffffff : 0;
+  const u32 s001 = (0x001 & SALT) ? 0xffffffff : 0;
+  const u32 s002 = (0x002 & SALT) ? 0xffffffff : 0;
+  const u32 s004 = (0x004 & SALT) ? 0xffffffff : 0;
+  const u32 s008 = (0x008 & SALT) ? 0xffffffff : 0;
+  const u32 s010 = (0x010 & SALT) ? 0xffffffff : 0;
+  const u32 s020 = (0x020 & SALT) ? 0xffffffff : 0;
+  const u32 s040 = (0x040 & SALT) ? 0xffffffff : 0;
+  const u32 s080 = (0x080 & SALT) ? 0xffffffff : 0;
+  const u32 s100 = (0x100 & SALT) ? 0xffffffff : 0;
+  const u32 s200 = (0x200 & SALT) ? 0xffffffff : 0;
+  const u32 s400 = (0x400 & SALT) ? 0xffffffff : 0;
+  const u32 s800 = (0x800 & SALT) ? 0xffffffff : 0;

  KXX_DECL u32 k00, k01, k02, k03, k04, k05;
  KXX_DECL u32 k06, k07, k08, k09, k10, k11;
@@ -1967,7 +1967,7 @@ DECLSPEC void m01500m (__global pw_t *pws, __global const kernel_rule_t *rules_b
   * inner loop
   */

-  const u32 pc_pos = get_local_id (1);
+  const u32 pc_pos = get_global_id (1);

  const u32 il_pos = pc_pos * 32;

@@ -2411,7 +2411,7 @@ DECLSPEC void m01500s (__global pw_t *pws, __global const kernel_rule_t *rules_b
   * inner loop
   */

-  const u32 pc_pos = get_local_id (1);
+  const u32 pc_pos = get_global_id (1);

  const u32 il_pos = pc_pos * 32;

--- a/OpenCL/m03000_a3.cl
+++ b/OpenCL/m03000_a3.cl
@@ -1803,7 +1803,7 @@ DECLSPEC void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_b
   * inner loop
   */

-  const u32 pc_pos = get_local_id (1);
+  const u32 pc_pos = get_global_id (1);

  const u32 il_pos = pc_pos * 32;

@@ -2247,7 +2247,7 @@ DECLSPEC void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_b
   * inner loop
   */

-  const u32 pc_pos = get_local_id (1);
+  const u32 pc_pos = get_global_id (1);

  const u32 il_pos = pc_pos * 32;

--- a/OpenCL/m14000_a3.cl
+++ b/OpenCL/m14000_a3.cl
@@ -2014,7 +2014,7 @@ DECLSPEC void m14000m (__global pw_t *pws, __global const kernel_rule_t *rules_b
   * inner loop
   */

-  const u32 pc_pos = get_local_id (1);
+  const u32 pc_pos = get_global_id (1);

  const u32 il_pos = pc_pos * 32;

@@ -2522,7 +2522,7 @@ DECLSPEC void m14000s (__global pw_t *pws, __global const kernel_rule_t *rules_b
   * inner loop
   */

-  const u32 pc_pos = get_local_id (1);
+  const u32 pc_pos = get_global_id (1);

  const u32 il_pos = pc_pos * 32;

--- a/src/interface.c
+++ b/src/interface.c
@@ -25991,17 +25991,7 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p

  // for CPU we just do 1 ...

-  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
-  {
-    // ... as long as it is not a bitsliced kernel, as they have a fixed 2nd dimension size of 32 in run_kernel
-
-    if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
-    {
-      return 32;
-    }
-
-    return 1;
-  }
+  if (device_param->device_type & CL_DEVICE_TYPE_CPU) return 1;

  // this is an upper limit, a good start, since our strategy is to reduce thread counts only

--- a/src/opencl.c
+++ b/src/opencl.c
@@ -1602,8 +1602,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con

  if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
  {
-    const size_t global_work_size[3] = { num_elements,        32, 1 };
-    const size_t local_work_size[3]  = { kernel_threads / 32, 32, 1 };
+    const size_t global_work_size[3] = { num_elements,  32, 1 };
+    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };

    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &event);