From 5e0eb288c90f559d27861eb67d4652f7372ce00b Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Sun, 16 Jun 2019 18:01:26 +0200
Subject: [PATCH] Use __launch_bounds__ in CUDA as replacement for
 reqd_work_group_size() in OpenCL

---
 OpenCL/inc_platform.cl |  3 +++
 OpenCL/m03200-pure.cl  |  6 +++---
 OpenCL/m09000-pure.cl  |  4 ++--
 OpenCL/m18600-pure.cl  |  2 +-
 include/backend.h      |  2 +-
 src/backend.c          | 30 ++++++++++--------------------
 6 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index 16761cc27..b3ee39697 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -8,6 +8,7 @@
 #include "inc_platform.h"
 
 #ifdef IS_NATIVE
+#define FIXED_THREAD_COUNT(n)
 #define SYNC_THREADS()
 #endif
 
@@ -107,9 +108,11 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
   return ((a >> n) | ((a << (64 - n))));
 }
 
+#define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0)
 #define SYNC_THREADS() __syncthreads ()
 #endif
 
 #ifdef IS_OPENCL
+#define FIXED_THREAD_COUNT(n) __attribute__((reqd_work_group_size((n), 1, 1)))
 #define SYNC_THREADS() barrier (CLK_LOCAL_MEM_FENCE)
 #endif
diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl
index 7d874c130..d4eaac064 100644
--- a/OpenCL/m03200-pure.cl
+++ b/OpenCL/m03200-pure.cl
@@ -376,7 +376,7 @@ DECLSPEC void expand_key (u32 *E, u32 *W, const int len)
   }
 }
 
-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
 {
   /**
    * base
@@ -587,7 +587,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03
   }
 }
 
-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
 {
   /**
    * base
@@ -779,7 +779,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03
   }
 }
 
-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t))
 {
   /**
    * base
diff --git a/OpenCL/m09000-pure.cl b/OpenCL/m09000-pure.cl
index 591cdc0cf..15db9287b 100644
--- a/OpenCL/m09000-pure.cl
+++ b/OpenCL/m09000-pure.cl
@@ -357,7 +357,7 @@ CONSTANT_VK u32a c_pbox[18] =
   L ^= P[17];           \
 }
 
-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
 {
   /**
    * base
@@ -576,7 +576,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09
   }
 }
 
-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t))
 {
   /**
    * base
diff --git a/OpenCL/m18600-pure.cl b/OpenCL/m18600-pure.cl
index 930b85f59..3ec64083e 100644
--- a/OpenCL/m18600-pure.cl
+++ b/OpenCL/m18600-pure.cl
@@ -586,7 +586,7 @@ KERNEL_FQ void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
   }
 }
 
-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
 {
   const u64 gid = get_global_id (0);
   const u64 lid = get_local_id (0);
diff --git a/include/backend.h b/include/backend.h
index e2556d7b2..074a0cd9f 100644
--- a/include/backend.h
+++ b/include/backend.h
@@ -68,7 +68,7 @@ int hc_cuMemcpyHtoD              (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDev
 int hc_cuMemFree                 (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
 int hc_cuModuleGetFunction       (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
 int hc_cuModuleLoadDataEx        (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-int hc_cuModuleLoadDataExLog     (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, const u64 threads_per_block);
+int hc_cuModuleLoadDataExLog     (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image);
 int hc_cuModuleUnload            (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
 int hc_cuStreamCreate            (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags);
 int hc_cuStreamDestroy           (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
diff --git a/src/backend.c b/src/backend.c
index 6fa9bb4e6..694bd8061 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -1297,15 +1297,15 @@ int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const v
   return 0;
 }
 
-int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, const u64 threads_per_block)
+int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image)
 {
   #define LOG_SIZE 8192
 
   char *info_log  = hcmalloc (LOG_SIZE);
   char *error_log = hcmalloc (LOG_SIZE);
 
-  CUjit_option opts[7];
-  void *vals[7];
+  CUjit_option opts[6];
+  void *vals[6];
 
   opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT;
   vals[0] = (void *) 0;
@@ -1325,17 +1325,7 @@ int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, cons
   opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
   vals[5] = (void *) LOG_SIZE;
 
-  int opts_cnt = 6;
-
-  if ((threads_per_block > 0) && (threads_per_block < 1024))
-  {
-    opts[6] = CU_JIT_THREADS_PER_BLOCK;
-    vals[6] = (void *) threads_per_block;
-
-    opts_cnt++;
-  }
-
-  const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, module, image, opts_cnt, opts, vals);
+  const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, module, image, 6, opts, vals);
 
   #if defined (DEBUG)
   printf ("cuModuleLoadDataEx() Info Log (%d):\n%s\n\n",  (int) strlen (info_log),  info_log);
@@ -7778,7 +7768,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
           if (rc_nvrtcDestroyProgram == -1) return -1;
 
-          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary, device_param->kernel_threads_max);
+          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary);
 
           if (rc_cuModuleLoadDataEx == -1) return -1;
 
@@ -7864,7 +7854,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
         if (device_param->is_cuda == true)
         {
-          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, kernel_sources[0], device_param->kernel_threads_max);
+          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, kernel_sources[0]);
 
           if (rc_cuModuleLoadDataEx == -1) return -1;
         }
@@ -8032,7 +8022,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
             // tbd: check for some useful options
 
-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary, 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary);
 
             if (rc_cuModuleLoadDataEx == -1) return -1;
 
@@ -8116,7 +8106,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
           if (device_param->is_cuda == true)
           {
-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, kernel_sources[0], 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, kernel_sources[0]);
 
             if (rc_cuModuleLoadDataEx == -1) return -1;
           }
@@ -8287,7 +8277,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
             // tbd: check for some useful options
 
-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary, 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary);
 
             if (rc_cuModuleLoadDataEx == -1) return -1;
 
@@ -8371,7 +8361,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
           if (device_param->is_cuda == true)
           {
-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, kernel_sources[0], 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, kernel_sources[0]);
 
             if (rc_cuModuleLoadDataEx == -1) return -1;
           }