From b0ff6258447a3098480a233f672ade8cf3003709 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Fri, 9 Feb 2018 16:42:28 +0100
Subject: [PATCH] OpenCL Kernels: Thread-count is switched from native to
 maximum - as a consequence we assume host memory pool of 2GB per GPU

---
 docs/changes.txt |  1 +
 src/interface.c  | 48 ++----------------------------------------------
 src/opencl.c     | 20 ++++++++++++++++++++
 3 files changed, 23 insertions(+), 46 deletions(-)

diff --git a/docs/changes.txt b/docs/changes.txt
index a13d8c3c9..b6cacc05a 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -61,6 +61,7 @@
 - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
 - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
 - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
+- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
 - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
 - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
 - OpenCL Runtime: Updated rocm detection
diff --git a/src/interface.c b/src/interface.c
index f81f9109c..9d422c020 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -25980,53 +25980,9 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
 
   if (forced_kernel_threads) return forced_kernel_threads;
 
-  // otherwise it depends on (1) the opencl device type
+  // otherwise it depends on the opencl device type
 
-  u32 kernel_threads = 0;
-
-  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
-  {
-    kernel_threads = KERNEL_THREADS_MAX_CPU;
-  }
-  else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
-  {
-    if (device_param->device_vendor_id == VENDOR_ID_NV)
-    {
-      kernel_threads = KERNEL_THREADS_MAX_GPU_NV;
-    }
-    else if (device_param->device_vendor_id == VENDOR_ID_AMD)
-    {
-      kernel_threads = KERNEL_THREADS_MAX_GPU_AMD;
-    }
-    else
-    {
-      kernel_threads = KERNEL_THREADS_MAX_GPU;
-    }
-  }
-  else
-  {
-    kernel_threads = KERNEL_THREADS_MAX_OTHER;
-  }
-
-  if (user_options->workload_profile == 4)
-  {
-    kernel_threads = (u32) device_param->device_maxworkgroup_size;
-  }
-
-  // and (2) an opencl device can force an lower value (limited resources on device)
-
-  kernel_threads = MIN (kernel_threads, (u32) device_param->device_maxworkgroup_size);
-
-  // and (3) if an OpenCL device allows a very high thread count (for example 1024 on nvidia),
-  // the host memory required is 32 times as high with 32 (It jumps from 128MB to 4GB device memory requirement).
-  // since there's no device with that much device memory (because of 1/4 memory rule) it has to limit the
-  // kernel_accel_max to be a very low number because the pws buffer will be so large otherwise.
-  // therefore autotune will be unable to calculate a good kernel_accel multiplier.
-  // currently there's no OpenCL device known that needs result in a better performance with 1024 threads compared to 256.
-  // as a result, we limit the number of threads to 64, which turns out to be a general good value.
-  // there's a 1.00% - 2.75% performance drop at NV caused by this, and 0.00% - 1.02% at AMD.
-
-  kernel_threads = MIN (kernel_threads, 64);
+  const u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size;
 
   return kernel_threads;
 }
diff --git a/src/opencl.c b/src/opencl.c
index 7119760cb..f01d1d12f 100644
--- a/src/opencl.c
+++ b/src/opencl.c
@@ -4078,6 +4078,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
       device_param->kernel_loops_max = user_options->kernel_loops;
     }
 
+    // limit scrypt accel otherwise we hurt ourself when calculating the scrypt tmto
+
+    if ((hashconfig->hash_mode == 8900) || (hashconfig->hash_mode == 9300) || (hashconfig->hash_mode == 15700))
+    {
+      // 16 is actually a bit low, we may need to change this depending on user response
+
+      device_param->kernel_accel_max = MIN (device_param->kernel_accel_max, 16);
+    }
+
     // we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep
 
     if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
@@ -4356,6 +4365,12 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
     size_t size_tmps     = 4;
     size_t size_hooks    = 4;
 
+    // instead of adding a thread limit we can also use a memory limit.
+    // this value should represent a reasonable amount of memory a host system has per GPU.
+    // note we're allocating 3 blocks of that size.
+
+    #define PWS_SPACE (512 * 1024 * 1024)
+
     // sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
     // let's add some extra space just to be sure.
 
@@ -4392,6 +4407,11 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
 
       int memory_limit_hit = 0;
 
+      if (user_options->workload_profile < 4)
+      {
+        if (size_pws > PWS_SPACE) memory_limit_hit = 1;
+      }
+
       if ((size_pws   + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
       if ((size_tmps  + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
       if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;