From 5e0eb288c90f559d27861eb67d4652f7372ce00b Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 16 Jun 2019 18:01:26 +0200 Subject: [PATCH] Use __launch_bounds__ in CUDA as replacement for reqd_work_group_size() in OpenCL --- OpenCL/inc_platform.cl | 3 +++ OpenCL/m03200-pure.cl | 6 +++--- OpenCL/m09000-pure.cl | 4 ++-- OpenCL/m18600-pure.cl | 2 +- include/backend.h | 2 +- src/backend.c | 30 ++++++++++-------------------- 6 files changed, 20 insertions(+), 27 deletions(-) diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl index 16761cc27..b3ee39697 100644 --- a/OpenCL/inc_platform.cl +++ b/OpenCL/inc_platform.cl @@ -8,6 +8,7 @@ #include "inc_platform.h" #ifdef IS_NATIVE +#define FIXED_THREAD_COUNT(n) #define SYNC_THREADS() #endif @@ -107,9 +108,11 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n) return ((a >> n) | ((a << (64 - n)))); } +#define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0) #define SYNC_THREADS() __syncthreads () #endif #ifdef IS_OPENCL +#define FIXED_THREAD_COUNT(n) __attribute__((reqd_work_group_size((n), 1, 1))) #define SYNC_THREADS() barrier (CLK_LOCAL_MEM_FENCE) #endif diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl index 7d874c130..d4eaac064 100644 --- a/OpenCL/m03200-pure.cl +++ b/OpenCL/m03200-pure.cl @@ -376,7 +376,7 @@ DECLSPEC void expand_key (u32 *E, u32 *W, const int len) } } -KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t)) +KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t)) { /** * base @@ -587,7 +587,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03 } } -KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t)) +KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t)) { /** * base @@ -779,7 +779,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03 } } -KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t)) +KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t)) { /** * base diff --git a/OpenCL/m09000-pure.cl b/OpenCL/m09000-pure.cl index 591cdc0cf..15db9287b 100644 --- a/OpenCL/m09000-pure.cl +++ b/OpenCL/m09000-pure.cl @@ -357,7 +357,7 @@ CONSTANT_VK u32a c_pbox[18] = L ^= P[17]; \ } -KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t)) +KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t)) { /** * base @@ -576,7 +576,7 @@ KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09 } } -KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t)) +KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t)) { /** * base diff --git a/OpenCL/m18600-pure.cl b/OpenCL/m18600-pure.cl index 930b85f59..3ec64083e 100644 --- a/OpenCL/m18600-pure.cl +++ b/OpenCL/m18600-pure.cl @@ -586,7 +586,7 @@ KERNEL_FQ void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t)) } } -KERNEL_FQ void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t)) +KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t)) { const u64 gid = get_global_id (0); const u64 lid = get_local_id (0); diff --git a/include/backend.h b/include/backend.h index e2556d7b2..074a0cd9f 100644 --- a/include/backend.h +++ b/include/backend.h @@ -68,7 +68,7 @@ int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDev int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr); int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name); int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); -int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, const u64 threads_per_block); +int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image); int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod); int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags); int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream); diff --git a/src/backend.c b/src/backend.c index 6fa9bb4e6..694bd8061 100644 --- a/src/backend.c +++ b/src/backend.c @@ -1297,15 +1297,15 @@ int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const v return 0; } -int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, const u64 threads_per_block) +int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image) { #define LOG_SIZE 8192 char *info_log = hcmalloc (LOG_SIZE); char *error_log = hcmalloc (LOG_SIZE); - CUjit_option opts[7]; - void *vals[7]; + CUjit_option opts[6]; + void *vals[6]; opts[0] = CU_JIT_TARGET_FROM_CUCONTEXT; vals[0] = (void *) 0; @@ -1325,17 +1325,7 @@ int hc_cuModuleLoadDataExLog (hashcat_ctx_t *hashcat_ctx, CUmodule *module, cons opts[5] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; vals[5] = (void *) LOG_SIZE; - int opts_cnt = 6; - - if ((threads_per_block > 0) && (threads_per_block < 1024)) - { - opts[6] = CU_JIT_THREADS_PER_BLOCK; - vals[6] = (void *) threads_per_block; - - opts_cnt++; - } - - const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, module, image, opts_cnt, opts, vals); + const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataEx (hashcat_ctx, module, image, 6, opts, vals); #if defined (DEBUG) printf ("cuModuleLoadDataEx() Info Log (%d):\n%s\n\n", (int) strlen (info_log), info_log); @@ -7778,7 +7768,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (rc_nvrtcDestroyProgram == -1) return -1; - const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary, device_param->kernel_threads_max); + const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, binary); if (rc_cuModuleLoadDataEx == -1) return -1; @@ -7864,7 +7854,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { - const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, kernel_sources[0], device_param->kernel_threads_max); + const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module, kernel_sources[0]); if (rc_cuModuleLoadDataEx == -1) return -1; } @@ -8032,7 +8022,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // tbd: check for some useful options - const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary, 0); + const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, binary); if (rc_cuModuleLoadDataEx == -1) return -1; @@ -8116,7 +8106,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { - const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, kernel_sources[0], 0); + const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_mp, kernel_sources[0]); if (rc_cuModuleLoadDataEx == -1) return -1; } @@ -8287,7 +8277,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // tbd: check for some useful options - const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary, 0); + const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, binary); if (rc_cuModuleLoadDataEx == -1) return -1; @@ -8371,7 +8361,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { - const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, kernel_sources[0], 0); + const int rc_cuModuleLoadDataEx = hc_cuModuleLoadDataExLog (hashcat_ctx, &device_param->cuda_module_amp, kernel_sources[0]); if (rc_cuModuleLoadDataEx == -1) return -1; }