From fb82bfc169752c6dfc6cd20d6d97f649cf7c2992 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Wed, 8 May 2019 23:30:07 +0200
Subject: [PATCH] Improve thread handling based on FIXED_LOCAL_SIZE

---
 src/backend.c              | 40 ++++++++++++--------------------------
 src/modules/module_03200.c |  7 -------
 src/modules/module_09000.c | 37 +++++++++++++++++++++++++++--------
 src/modules/module_18600.c | 37 +++++++++++++++++++++++++++--------
 4 files changed, 70 insertions(+), 51 deletions(-)

diff --git a/src/backend.c b/src/backend.c
index c5ea4bf1f..ed88b191f 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -3093,33 +3093,6 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
 
   if (device_param->is_cuda == true)
   {
-    u64 local_mem_size = 0;
-
-    switch (kern_run)
-    {
-      case KERN_RUN_1:      local_mem_size  = device_param->kernel_local_mem_size1;       break;
-      case KERN_RUN_12:     local_mem_size  = device_param->kernel_local_mem_size12;      break;
-      case KERN_RUN_2:      local_mem_size  = device_param->kernel_local_mem_size2;       break;
-      case KERN_RUN_23:     local_mem_size  = device_param->kernel_local_mem_size23;      break;
-      case KERN_RUN_3:      local_mem_size  = device_param->kernel_local_mem_size3;       break;
-      case KERN_RUN_4:      local_mem_size  = device_param->kernel_local_mem_size4;       break;
-      case KERN_RUN_INIT2:  local_mem_size  = device_param->kernel_local_mem_size_init2;  break;
-      case KERN_RUN_LOOP2:  local_mem_size  = device_param->kernel_local_mem_size_loop2;  break;
-      case KERN_RUN_AUX1:   local_mem_size  = device_param->kernel_local_mem_size_aux1;   break;
-      case KERN_RUN_AUX2:   local_mem_size  = device_param->kernel_local_mem_size_aux2;   break;
-      case KERN_RUN_AUX3:   local_mem_size  = device_param->kernel_local_mem_size_aux3;   break;
-      case KERN_RUN_AUX4:   local_mem_size  = device_param->kernel_local_mem_size_aux4;   break;
-    }
-
-    /*
-    if (local_mem_size)
-    {
-      const u32 max_threads_possible = (device_param->device_local_mem_size - 240) / local_mem_size;
-
-      kernel_threads = MIN (kernel_threads, max_threads_possible);
-    }
-    */
-
     CUfunction cuda_function = NULL;
 
     if (device_param->is_cuda == true)
@@ -7039,7 +7012,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       }
     }
 
-    // there's not thread column in tuning db, stick to commandline if defined
+    // there's no thread column in tuning db, stick to commandline if defined
 
     if (user_options->kernel_threads_chgd == true)
     {
@@ -7291,6 +7264,17 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       if (jit_build_options != NULL)
       {
         build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options);
+
+        // this is a bit ugly
+        // would be better to have the module return the value directly instead of parsing it back out of the build options string
+
+        u32 fixed_local_size = 0;
+
+        if (sscanf (jit_build_options, "-D FIXED_LOCAL_SIZE=%u", &fixed_local_size) == 1)
+        {
+          device_param->kernel_threads_min = fixed_local_size;
+          device_param->kernel_threads_max = fixed_local_size;
+        }
       }
     }
 
diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c
index 6cd15c7c7..b0b35b627 100644
--- a/src/modules/module_03200.c
+++ b/src/modules/module_03200.c
@@ -108,13 +108,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
       {
         overhead = 4;
       }
-
-      // no clue yet where this is coming from
-
-      if (device_param->is_cuda == true)
-      {
-        overhead = 240;
-      }
     }
 
     if (user_options->kernel_threads_chgd == true)
diff --git a/src/modules/module_09000.c b/src/modules/module_09000.c
index 8817fd4b6..e8cdac075 100644
--- a/src/modules/module_09000.c
+++ b/src/modules/module_09000.c
@@ -74,6 +74,11 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
+  // this uses some nice feedback effect.
+  // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value
+  // which is then read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result.
+  // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1.
+
   u32 fixed_local_size = 0;
 
   if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
@@ -82,19 +87,35 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   }
   else
   {
-    if (user_options->kernel_threads_chgd == true)
-    {
-      fixed_local_size = user_options->kernel_threads;
-    }
-    else
-    {
-      u32 overhead = 0;
+    u32 overhead = 0;
 
-      if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
+      // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
+      // on my development system. no clue where the 4 bytes are spent.
+      // I did some research on this and it seems to be related to the datatype.
+      // For example, if I used u8 instead, there's only 1 byte wasted.
+
+      if (device_param->is_opencl == true)
       {
         overhead = 4;
       }
+    }
 
+    if (user_options->kernel_threads_chgd == true)
+    {
+      fixed_local_size = user_options->kernel_threads;
+
+      // otherwise out-of-bound reads
+
+      if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
+      {
+        fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+      }
+    }
+    else
+    {
       fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
     }
   }
diff --git a/src/modules/module_18600.c b/src/modules/module_18600.c
index 109a3f65c..663717538 100644
--- a/src/modules/module_18600.c
+++ b/src/modules/module_18600.c
@@ -66,6 +66,11 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
+  // this uses some nice feedback effect.
+  // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value
+  // which is then read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result.
+  // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1.
+
   u32 fixed_local_size = 0;
 
   if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
@@ -74,19 +79,35 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   }
   else
   {
-    if (user_options->kernel_threads_chgd == true)
-    {
-      fixed_local_size = user_options->kernel_threads;
-    }
-    else
-    {
-      u32 overhead = 0;
+    u32 overhead = 0;
 
-      if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
+    {
+      // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
+      // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
+      // on my development system. no clue where the 4 bytes are spent.
+      // I did some research on this and it seems to be related to the datatype.
+      // For example, if I used u8 instead, there's only 1 byte wasted.
+
+      if (device_param->is_opencl == true)
       {
         overhead = 4;
       }
+    }
 
+    if (user_options->kernel_threads_chgd == true)
+    {
+      fixed_local_size = user_options->kernel_threads;
+
+      // otherwise out-of-bound reads
+
+      if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
+      {
+        fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+      }
+    }
+    else
+    {
       fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
     }
   }