OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count

2025-07-31 19:08:43 +00:00 · 2018-02-11 10:56:08 +01:00 · 2018-02-11 10:56:08 +01:00 · d656e9c3a4
commit d656e9c3a4
parent a7a15df911
9 changed files with 130 additions and 17 deletions
--- a/OpenCL/inc_common.cl
+++ b/OpenCL/inc_common.cl
@ -61724,7 +61724,8 @@ __kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max)
  pw.i[62] = 0;
  pw.i[63] = 0; // yep that's faster

-  pw.pw_len = 1 + (l32 & 15);
+  //pw.pw_len = 1 + (l32 & 15);
+  pw.pw_len = 7; // some algorithms are very sensible on this (example: 12500)

  buf[gid] = pw;
 }
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -63,6 +63,7 @@
 - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
 - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
 - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
+- OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count
 - OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
 - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
 - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
--- a/include/shared.h
+++ b/include/shared.h
@ -61,4 +61,7 @@ bool hc_same_files (char *file1, char *file2);
 u32 hc_strtoul  (const char *nptr, char **endptr, int base);
 u64 hc_strtoull (const char *nptr, char **endptr, int base);

+u32 power_of_two_ceil_32  (const u32 v);
+u32 power_of_two_floor_32 (const u32 v);
+
 #endif // _SHARED_H
--- a/include/types.h
+++ b/include/types.h
@ -138,14 +138,19 @@ typedef enum amplifier_count
  KERNEL_BFS                        = 1024,
  KERNEL_COMBS                      = 1024,
  KERNEL_RULES                      = 256,
-  KERNEL_THREADS_MAX_CPU            = 1,
-  KERNEL_THREADS_MAX_GPU            = 8,  // ex: intel integrated
-  KERNEL_THREADS_MAX_GPU_NV         = 32, // optimized NV  size: warps
-  KERNEL_THREADS_MAX_GPU_AMD        = 64, // optimized AMD size: wavefronts
-  KERNEL_THREADS_MAX_OTHER          = 8,  // ex: intel MIC

 } amplifier_count_t;

+typedef enum native_threads
+{
+  KERNEL_THREADS_NATIVE_CPU         = 1,
+  KERNEL_THREADS_NATIVE_GPU         = 8,  // ex: intel integrated
+  KERNEL_THREADS_NATIVE_GPU_NV      = 32, // optimized NV  size: warps
+  KERNEL_THREADS_NATIVE_GPU_AMD     = 64, // optimized AMD size: wavefronts
+  KERNEL_THREADS_NATIVE_OTHER       = 8,  // ex: intel MIC
+
+} native_threads_t;
+
 typedef enum vendor_id
 {
  VENDOR_ID_AMD           = (1 << 0),
@ -923,6 +928,7 @@ typedef struct hc_device_param
  u64     device_global_mem;
  u32     device_maxclock_frequency;
  size_t  device_maxworkgroup_size;
+  u64     device_local_mem_size;

  u32     vector_width;

--- a/src/hashes.c
+++ b/src/hashes.c
@ -351,11 +351,11 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
    return -1;
  }

-  // we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance
-  // but sometimes, when a benchmark kernel run cracks a kernel, we don't want to see that!
-
  if (user_options->speed_only == true)
  {
+    // we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance
+    // however if the benchmark cracks the artificial hash used for benchmarks we don't want to see that!
+
    return 0;
  }

--- a/src/interface.c
+++ b/src/interface.c
@ -25970,6 +25970,66 @@ u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx)
  return kernel_threads;
 }

+u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
+{
+  hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
+
+  u32 kernel_threads = 0;
+
+  // sometimes there's a high kernel requirement for local memory (which is multiplied with threads)
+
+  u32 local_mem_per_thread = 0;
+
+  // basically the sum of all .local space of the _loop kernel
+  // see .ptx
+
+  if (hashconfig->hash_mode ==  1800) local_mem_per_thread = 1024;
+  if (hashconfig->hash_mode == 12500) local_mem_per_thread = 3296;
+  if (hashconfig->hash_mode == 13400) local_mem_per_thread = 5360;
+
+  if (local_mem_per_thread)
+  {
+    const u32 device_local_mem_size = (const u32) device_param->device_local_mem_size;
+
+    kernel_threads = device_local_mem_size / local_mem_per_thread;
+
+    // there can be some very unaligned results from this, therefore round it down to next power of two
+
+    kernel_threads = power_of_two_floor_32 (kernel_threads);
+  }
+
+  // make sure to not underpower
+
+  if (kernel_threads)
+  {
+    if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+    {
+      kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
+    }
+    else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
+    {
+      if (device_param->device_vendor_id == VENDOR_ID_NV)
+      {
+        kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_NV);
+      }
+      else if (device_param->device_vendor_id == VENDOR_ID_AMD)
+      {
+        kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_AMD);
+      }
+      else
+      {
+        kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU);
+      }
+    }
+    else
+    {
+      kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_OTHER);
+    }
+  }
+
+  return kernel_threads;
+}
+
 u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
 {
  const user_options_t *user_options = hashcat_ctx->user_options;
@ -25980,13 +26040,22 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p

  if (forced_kernel_threads) return forced_kernel_threads;

-  // otherwise it depends on the opencl device type
+  // it can also depends on the opencl device type

-  u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size;
+  u32 kernel_threads = (u32) device_param->device_maxworkgroup_size;

  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
  {
-    kernel_threads = MIN (kernel_threads, KERNEL_THREADS_MAX_CPU);
+    kernel_threads = MIN (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
+  }
+
+  // or if it requires for example a lot of local memory
+
+  const u32 limited_kernel_threads = hashconfig_limited_kernel_threads (hashcat_ctx, device_param);
+
+  if (limited_kernel_threads)
+  {
+    kernel_threads = MIN (kernel_threads, limited_kernel_threads);
  }

  return kernel_threads;
@ -25994,7 +26063,7 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p

 u32 hashconfig_get_kernel_loops (hashcat_ctx_t *hashcat_ctx)
 {
-  hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
+  const hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
  const user_options_t *user_options = hashcat_ctx->user_options;

  u32 kernel_loops_fixed = 0;
--- a/src/opencl.c
+++ b/src/opencl.c
@ -3306,6 +3306,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
        device_param->skipped = true;
      }

+      device_param->device_local_mem_size = device_local_mem_size;
+
      // If there's both an Intel CPU and an AMD OpenCL runtime it's a tricky situation
      // Both platforms support CPU device types and therefore both will try to use 100% of the physical resources
      // This results in both utilizing it for 50%
--- a/src/shared.c
+++ b/src/shared.c
@ -506,3 +506,32 @@ u64 hc_strtoull (const char *nptr, char **endptr, int base)
 {
  return (u64) strtoull (nptr, endptr, base);
 }
+
+u32 power_of_two_ceil_32 (const u32 v)
+{
+  u32 r = v;
+
+  r--;
+
+  r |= r >> 1;
+  r |= r >> 2;
+  r |= r >> 4;
+  r |= r >> 8;
+  r |= r >> 16;
+
+  r++;
+
+  return r;
+}
+
+u32 power_of_two_floor_32 (const u32 v)
+{
+  u32 r = power_of_two_ceil_32 (v);
+
+  if (r > v)
+  {
+    r >>= 1;
+  }
+
+  return r;
+}
--- a/src/status.c
+++ b/src/status.c
@ -1389,15 +1389,17 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d

  if (device_param->skipped == false)
  {
-    for (int i = 0; i < SPEED_CACHE; i++)
+    const u32 speed_pos = device_param->speed_pos;
+
+    for (int i = 0; i < speed_pos; i++)
    {
      speed_cnt  += device_param->speed_cnt[i];
      speed_msec += device_param->speed_msec[i];
    }
-  }

-  speed_cnt  /= SPEED_CACHE;
-  speed_msec /= SPEED_CACHE;
+    speed_cnt  /= speed_pos;
+    speed_msec /= speed_pos;
+  }

  double hashes_dev_msec = 0;