OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU

2024-12-23 15:18:16 +00:00 · 2018-02-09 16:42:28 +01:00 · 2018-02-09 16:42:28 +01:00 · b0ff625844
commit b0ff625844
parent 088c45040b
3 changed files with 23 additions and 46 deletions
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -61,6 +61,7 @@
 - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
 - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
 - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
 - OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
 - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
 - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
 - OpenCL Runtime: Updated rocm detection
--- a/src/interface.c
+++ b/src/interface.c
@ -25980,53 +25980,9 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
  if (forced_kernel_threads) return forced_kernel_threads;
-  // otherwise it depends on (1) the opencl device type
+  // otherwise it depends on the opencl device type
-  u32 kernel_threads = 0;
+  const u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size;
  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
  {
    kernel_threads = KERNEL_THREADS_MAX_CPU;
  }
  else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
  {
    if (device_param->device_vendor_id == VENDOR_ID_NV)
    {
      kernel_threads = KERNEL_THREADS_MAX_GPU_NV;
    }
    else if (device_param->device_vendor_id == VENDOR_ID_AMD)
    {
      kernel_threads = KERNEL_THREADS_MAX_GPU_AMD;
    }
    else
    {
      kernel_threads = KERNEL_THREADS_MAX_GPU;
    }
  }
  else
  {
    kernel_threads = KERNEL_THREADS_MAX_OTHER;
  }
  if (user_options->workload_profile == 4)
  {
    kernel_threads = (u32) device_param->device_maxworkgroup_size;
  }
  // and (2) an opencl device can force an lower value (limited resources on device)
  kernel_threads = MIN (kernel_threads, (u32) device_param->device_maxworkgroup_size);
  // and (3) if an OpenCL device allows a very high thread count (for example 1024 on nvidia),
  // the host memory required is 32 times as high with 32 (It jumps from 128MB to 4GB device memory requirement).
  // since there's no device with that much device memory (because of 1/4 memory rule) it has to limit the
  // kernel_accel_max to be a very low number because the pws buffer will be so large otherwise.
  // therefore autotune will be unable to calculate a good kernel_accel multiplier.
  // currently there's no OpenCL device known that needs result in a better performance with 1024 threads compared to 256.
  // as a result, we limit the number of threads to 64, which turns out to be a general good value.
  // there's a 1.00% - 2.75% performance drop at NV caused by this, and 0.00% - 1.02% at AMD.
  kernel_threads = MIN (kernel_threads, 64);
  return kernel_threads;
 }
--- a/src/opencl.c
+++ b/src/opencl.c
@ -4078,6 +4078,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
      device_param->kernel_loops_max = user_options->kernel_loops;
    }
    // limit scrypt accel otherwise we hurt ourself when calculating the scrypt tmto
    if ((hashconfig->hash_mode == 8900) || (hashconfig->hash_mode == 9300) || (hashconfig->hash_mode == 15700))
    {
      // 16 is actually a bit low, we may need to change this depending on user response
      device_param->kernel_accel_max = MIN (device_param->kernel_accel_max, 16);
    }
    // we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep
    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
@ -4356,6 +4365,12 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
    size_t size_tmps     = 4;
    size_t size_hooks    = 4;
    // instead of adding a thread limit we can also use a memory limit.
    // this value should represent a reasonable amount of memory a host system has per GPU.
    // note we're allocating 3 blocks of that size.
    #define PWS_SPACE (512 * 1024 * 1024)
    // sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
    // let's add some extra space just to be sure.
@ -4392,6 +4407,11 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
      int memory_limit_hit = 0;
      if (user_options->workload_profile < 4)
      {
        if (size_pws > PWS_SPACE) memory_limit_hit = 1;
      }
      if ((size_pws   + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
      if ((size_tmps  + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
      if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;