From fb7bb045875470a4988b8fe985c87e6e4e6d004a Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 2 Feb 2020 11:15:37 +0100 Subject: [PATCH] Do not use dynamic shared memory if dynamic_local_mem_size is a multiple of local_mem_size --- OpenCL/m03200-pure.cl | 24 ++++++------ src/backend.c | 76 ++++++++++++++++++++------------------ src/modules/module_03200.c | 55 ++++++++++++++++++--------- 3 files changed, 90 insertions(+), 65 deletions(-) diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl index af739345d..282e2d20b 100644 --- a/OpenCL/m03200-pure.cl +++ b/OpenCL/m03200-pure.cl @@ -461,10 +461,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - u32 *S0 = lm + (lid * 1024) + 0; - u32 *S1 = lm + (lid * 1024) + 256; - u32 *S2 = lm + (lid * 1024) + 512; - u32 *S3 = lm + (lid * 1024) + 768; + LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; + LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; + LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; + LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; @@ -626,10 +626,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - u32 *S0 = lm + (lid * 1024) + 0; - u32 *S1 = lm + (lid * 1024) + 256; - u32 *S2 = lm + (lid * 1024) + 512; - u32 *S3 = lm + (lid * 1024) + 768; + LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; + LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; + LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; + LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; @@ -818,10 +818,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS } #ifdef DYNAMIC_LOCAL - u32 *S0 = lm + (lid * 1024) + 0; - u32 *S1 = lm + (lid * 1024) + 256; - u32 *S2 = lm + (lid * 1024) + 512; - u32 *S3 = lm + (lid * 1024) + 768; + LOCAL_AS u32 *S0 = lm + (lid * 1024) + 0; + LOCAL_AS u32 *S1 = lm + (lid * 1024) + 256; + LOCAL_AS u32 *S2 = lm + (lid * 1024) + 512; + LOCAL_AS u32 *S3 = lm + (lid * 1024) + 768; #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; diff --git a/src/backend.c b/src/backend.c index f7d73058a..25f18f0d3 100644 --- a/src/backend.c +++ b/src/backend.c @@ -3275,14 +3275,13 @@ int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic device_param->kernel_params_atinit[0] = (void *) &buf; device_param->kernel_params_atinit_buf64[1] = num_elements; - const u64 kernel_threads = device_param->kernel_wgs_atinit; - const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_atinit; + const u64 kernel_threads = device_param->kernel_wgs_atinit; num_elements = CEILDIV (num_elements, kernel_threads); CUfunction function = device_param->cuda_function_atinit; - if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_atinit, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_atinit, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; @@ -3300,8 +3299,7 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic device_param->kernel_params_memset_buf32[1] = value; device_param->kernel_params_memset_buf64[2] = num16d; - const u64 kernel_threads = device_param->kernel_wgs_memset; - const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_memset; + const u64 kernel_threads = device_param->kernel_wgs_memset; u64 num_elements = num16d; @@ -3316,7 +3314,7 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic //const size_t global_work_size[3] = { num_elements, 1, 1 }; //const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3484,6 +3482,18 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con break; } + if (device_param->is_cuda == true) + { + if ((device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) == 0) + { + // this is the case Compute Capability 7.5 + // there is also Compute Capability 7.0 which offers a larger dynamic local size access + // however, if it's an exact multiple the driver can optimize this for us more efficient + + dynamic_shared_mem = 0; + } + } + kernel_threads = MIN (kernel_threads, device_param->kernel_threads); device_param->kernel_params_buf64[34] = num; @@ -3511,6 +3521,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break; case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break; } + + if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1; } if (kernel_threads == 0) kernel_threads = 1; @@ -3767,23 +3779,13 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num) { - u64 kernel_threads = 0; - u64 dynamic_shared_mem = 0; + u64 kernel_threads = 0; switch (kern_run) { - case KERN_RUN_MP: - kernel_threads = device_param->kernel_wgs_mp; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_mp; - break; - case KERN_RUN_MP_R: - kernel_threads = device_param->kernel_wgs_mp_r; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_mp_r; - break; - case KERN_RUN_MP_L: - kernel_threads = device_param->kernel_wgs_mp_l; - dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_mp_l; - break; + case KERN_RUN_MP: kernel_threads = device_param->kernel_wgs_mp; break; + case KERN_RUN_MP_R: kernel_threads = device_param->kernel_wgs_mp_r; break; + case KERN_RUN_MP_L: kernel_threads = device_param->kernel_wgs_mp_l; break; } u64 num_elements = num; @@ -3816,7 +3818,7 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, num_elements = CEILDIV (num_elements, kernel_threads); - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, cuda_args, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, cuda_args, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3875,8 +3877,7 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { - const u64 num_elements = 1024; // fixed - const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_tm; + const u64 num_elements = 1024; // fixed const u64 kernel_threads = MIN (num_elements, device_param->kernel_wgs_tm); @@ -3884,7 +3885,7 @@ int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { CUfunction cuda_function = device_param->cuda_function_tm; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_tm, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_tm, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3912,8 +3913,7 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u64 num_elements = num; - const u64 kernel_threads = device_param->kernel_wgs_amp; - const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_amp; + const u64 kernel_threads = device_param->kernel_wgs_amp; if (device_param->is_cuda == true) { @@ -3921,7 +3921,7 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUfunction cuda_function = device_param->cuda_function_amp; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_amp, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_amp, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3953,8 +3953,7 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device u64 num_elements = num; - const u64 kernel_threads = device_param->kernel_wgs_decompress; - const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_decompress; + const u64 kernel_threads = device_param->kernel_wgs_decompress; if (device_param->is_cuda == true) { @@ -3962,7 +3961,7 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device CUfunction cuda_function = device_param->cuda_function_decompress; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_decompress, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_decompress, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -6806,7 +6805,9 @@ static int get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, C #define MAX_ASSUMED_SHARED (1024 * 1024) - for (int i = 0; i < MAX_ASSUMED_SHARED; i++) + u64 dynamic_shared_size_bytes = 0; + + for (int i = 1; i <= MAX_ASSUMED_SHARED; i++) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -6814,16 +6815,19 @@ static int get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, C const CUresult CU_err = cuda->cuFuncSetAttribute (function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, i); - if (CU_err == CUDA_SUCCESS) continue; + if (CU_err == CUDA_SUCCESS) + { + dynamic_shared_size_bytes = i; + + continue; + } break; } - int dynamic_shared_size_bytes = 0; + *result = dynamic_shared_size_bytes; - if (hc_cuFuncGetAttribute (hashcat_ctx, &dynamic_shared_size_bytes, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, function) == -1) return -1; - - *result = (u64) dynamic_shared_size_bytes; + if (hc_cuFuncSetAttribute (hashcat_ctx, function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 0) == -1) return -1; return 0; } diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c index 31c099730..3c528fed0 100644 --- a/src/modules/module_03200.c +++ b/src/modules/module_03200.c @@ -81,18 +81,32 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY { char *jit_build_options = NULL; + // this mode heavily depends on the available shared memory size + // note the kernel need to have some special code changes in order to make use to use post-48k memory region + // we need to set some macros + + bool use_dynamic = false; + + if (device_param->is_cuda == true) + { + if (device_param->kernel_dynamic_local_mem_size_memset % device_param->device_local_mem_size) + { + // this is the case Compute Capability 7.5 + // there is also Compute Capability 7.0 which offers a larger dynamic local size access + // however, if it's an exact multiple the driver can optimize this for us more efficient + + use_dynamic = true; + } + } + // this uses some nice feedback effect. // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value // which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result. // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1. - u32 fixed_local_size = 0; - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) { - fixed_local_size = 1; - - hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", 1); } else { @@ -108,45 +122,52 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY if (device_param->is_opencl == true) { - overhead = 4; + overhead = 1; } } if (user_options->kernel_threads_chgd == true) { - fixed_local_size = user_options->kernel_threads; + u32 fixed_local_size = user_options->kernel_threads; - // otherwise out-of-bound reads - - if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead)) + if (use_dynamic == true) { - fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; - } + if ((fixed_local_size * 4096) > device_param->kernel_dynamic_local_mem_size_memset) + { + // otherwise out-of-bound reads + + fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096; + } - if (device_param->is_cuda == true) - { hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size); } else { + if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead)) + { + // otherwise out-of-bound reads + + fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + } + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); } } else { - if (device_param->is_cuda == true) + if (use_dynamic == true) { // using kernel_dynamic_local_mem_size_memset is a bit hackish. // we had to brute-force this value out of an already loaded CUDA function. // there's no official way to query for this value. - fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096; + const u32 fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096; hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size); } else { - fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + const u32 fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); }