Use __launch_bounds__ in CUDA as replacement for reqd_work_group_size() in OpenCL

2025-07-30 10:28:41 +00:00 · 2019-06-16 18:01:26 +02:00 · 2019-06-16 18:01:26 +02:00 · 5e0eb288c9
commit 5e0eb288c9
parent dbbdb7e5ac
6 changed files with 20 additions and 27 deletions
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@ -8,6 +8,7 @@
 #include "inc_platform.h"

 #ifdef IS_NATIVE
+#define FIXED_THREAD_COUNT(n)
 #define SYNC_THREADS()
 #endif

@ -107,9 +108,11 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
  return ((a >> n) | ((a << (64 - n))));
 }

+#define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0)
 #define SYNC_THREADS() __syncthreads ()
 #endif

 #ifdef IS_OPENCL
+#define FIXED_THREAD_COUNT(n) __attribute__((reqd_work_group_size((n), 1, 1)))
 #define SYNC_THREADS() barrier (CLK_LOCAL_MEM_FENCE)
 #endif
--- a/OpenCL/m03200-pure.cl
+++ b/OpenCL/m03200-pure.cl
@ -376,7 +376,7 @@ DECLSPEC void expand_key (u32 *E, u32 *W, const int len)
  }
 }

-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
 {
  /**
   * base
@ -587,7 +587,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03
  }
 }

-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
 {
  /**
   * base
@ -779,7 +779,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03
  }
 }

-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t))
 {
  /**
   * base
--- a/OpenCL/m09000-pure.cl
+++ b/OpenCL/m09000-pure.cl
@ -357,7 +357,7 @@ CONSTANT_VK u32a c_pbox[18] =
  L ^= P[17];           \
 }

-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
 {
  /**
   * base
@ -576,7 +576,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09
  }
 }

-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t))
 {
  /**
   * base
--- a/OpenCL/m18600-pure.cl
+++ b/OpenCL/m18600-pure.cl
@ -586,7 +586,7 @@ KERNEL_FQ void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
  }
 }

-KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
+KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
 {
  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
--- a/include/backend.h
+++ b/include/backend.h
@ -68,7 +68,7 @@ int hc_cuMemcpyHtoD              (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDev
 int hc_cuMemFree                 (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
 int hc_cuModuleGetFunction       (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
 int hc_cuModuleLoadDataEx        (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-int hc_cuModuleLoadDataExLog     (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, const u64 threads_per_block);
+int hc_cuModuleLoadDataExLog     (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image);
 int hc_cuModuleUnload            (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
 int hc_cuStreamCreate            (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags);
 int hc_cuStreamDestroy           (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
--- a/src/backend.c
+++ b/src/backend.c
@ -1297,15 +1297,15 @@ int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const v
  return 0;
 }

-int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, const u64 threads_per_block)
+int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image)
 {
  #define LOG_SIZE 8192

  char *info_log  = hcmalloc (LOG_SIZE);
  char *error_log = hcmalloc (LOG_SIZE);

-  CUjit_option opts[7];
-  void *vals[7];
+  CUjit_option opts[6];
+  void *vals[6];

  opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT;
  vals[0] = (void *) 0;
@ -1325,17 +1325,7 @@ int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, cons
  opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  vals[5] = (void *) LOG_SIZE;

-  int opts_cnt = 6;
-
-  if ((threads_per_block > 0) && (threads_per_block < 1024))
-  {
-    opts[6] = CU_JIT_THREADS_PER_BLOCK;
-    vals[6] = (void *) threads_per_block;
-
-    opts_cnt++;
-  }
-
-  const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, module, image, opts_cnt, opts, vals);
+  const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, module, image, 6, opts, vals);

  #if defined (DEBUG)
  printf ("cuModuleLoadDataEx() Info Log (%d):\n%s\n\n",  (int) strlen (info_log),  info_log);
@ -7778,7 +7768,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

          if (rc_nvrtcDestroyProgram == -1) return -1;

-          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary, device_param->kernel_threads_max);
+          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary);

          if (rc_cuModuleLoadDataEx == -1) return -1;

@ -7864,7 +7854,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

        if (device_param->is_cuda == true)
        {
-          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, kernel_sources[0], device_param->kernel_threads_max);
+          const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, kernel_sources[0]);

          if (rc_cuModuleLoadDataEx == -1) return -1;
        }
@ -8032,7 +8022,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

            // tbd: check for some useful options

-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary, 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary);

            if (rc_cuModuleLoadDataEx == -1) return -1;

@ -8116,7 +8106,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

          if (device_param->is_cuda == true)
          {
-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, kernel_sources[0], 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, kernel_sources[0]);

            if (rc_cuModuleLoadDataEx == -1) return -1;
          }
@ -8287,7 +8277,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

            // tbd: check for some useful options

-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary, 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary);

            if (rc_cuModuleLoadDataEx == -1) return -1;

@ -8371,7 +8361,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

          if (device_param->is_cuda == true)
          {
-            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, kernel_sources[0], 0);
+            const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, kernel_sources[0]);

            if (rc_cuModuleLoadDataEx == -1) return -1;
          }