diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl index d4eaac064..af739345d 100644 --- a/OpenCL/m03200-pure.cl +++ b/OpenCL/m03200-pure.cl @@ -356,6 +356,10 @@ CONSTANT_VK u32a c_pbox[18] = L ^= P[17]; \ } +#ifdef DYNAMIC_LOCAL +extern __shared__ u32 lm[]; +#endif + DECLSPEC void expand_key (u32 *E, u32 *W, const int len) { u8 *E_ptr = (u8 *) E; @@ -456,6 +460,12 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS P[i] = c_pbox[i]; } + #ifdef DYNAMIC_LOCAL + u32 *S0 = lm + (lid * 1024) + 0; + u32 *S1 = lm + (lid * 1024) + 256; + u32 *S2 = lm + (lid * 1024) + 512; + u32 *S3 = lm + (lid * 1024) + 768; + #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; @@ -465,6 +475,7 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_init (KERN_ATTR_TMPS LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; LOCAL_AS u32 *S3 = S3_all[lid]; + #endif for (u32 i = 0; i < 256; i++) { @@ -614,6 +625,12 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS P[i] = tmps[gid].P[i]; } + #ifdef DYNAMIC_LOCAL + u32 *S0 = lm + (lid * 1024) + 0; + u32 *S1 = lm + (lid * 1024) + 256; + u32 *S2 = lm + (lid * 1024) + 512; + u32 *S3 = lm + (lid * 1024) + 768; + #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; @@ -623,6 +640,7 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_loop (KERN_ATTR_TMPS LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; LOCAL_AS u32 *S3 = S3_all[lid]; + #endif for (u32 i = 0; i < 256; i++) { @@ -799,6 +817,12 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS P[i] = tmps[gid].P[i]; } + #ifdef DYNAMIC_LOCAL + u32 *S0 = lm + (lid * 1024) + 0; + u32 *S1 = lm + (lid * 1024) + 256; + u32 *S2 = lm + (lid * 1024) + 512; + u32 *S3 = lm + (lid * 1024) + 768; + #else LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256]; LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256]; @@ -808,6 +832,7 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m03200_comp (KERN_ATTR_TMPS LOCAL_AS u32 *S1 = S1_all[lid]; LOCAL_AS u32 *S2 = S2_all[lid]; LOCAL_AS u32 *S3 = S3_all[lid]; + #endif for (u32 i = 0; i < 256; i++) { diff --git a/docs/changes.txt b/docs/changes.txt index 6f48ef0ca..957117e86 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -115,6 +115,7 @@ - OpenCL Runtime: Workaround JiT compiler error on ROCm 2.3 driver if the 'inline' keyword is used in function declaration - OpenCL Runtime: Workaround memory allocation error on AMD driver on Windows leading to CL_MEM_OBJECT_ALLOCATION_FAILURE - OpenCL Runtime: Workaround ROCm OpenCL driver problem trying to write temporary file into readonly folder by setting TMPDIR +- OpenCL Runtime: Allow the kernel to access post-48k shared memory region on CUDA. Requires both module and kernel preparation - Startup Checks: Improved the pidfile check: Do not just check for existing PID but also check executable filename - Startup Checks: Prevent the user to modify options which are overwritten automatically in benchmark mode - Startup Screen: Add extra warning when using --force diff --git a/include/types.h b/include/types.h index 3bd742549..51d23b57f 100644 --- a/include/types.h +++ b/include/types.h @@ -1132,6 +1132,27 @@ typedef struct hc_device_param u64 kernel_local_mem_size_aux3; u64 kernel_local_mem_size_aux4; + u64 kernel_dynamic_local_mem_size1; + u64 kernel_dynamic_local_mem_size12; + u64 kernel_dynamic_local_mem_size2; + u64 kernel_dynamic_local_mem_size23; + u64 kernel_dynamic_local_mem_size3; + u64 kernel_dynamic_local_mem_size4; + u64 kernel_dynamic_local_mem_size_init2; + u64 kernel_dynamic_local_mem_size_loop2; + u64 kernel_dynamic_local_mem_size_mp; + u64 kernel_dynamic_local_mem_size_mp_l; + u64 kernel_dynamic_local_mem_size_mp_r; + u64 kernel_dynamic_local_mem_size_amp; + u64 kernel_dynamic_local_mem_size_tm; + u64 kernel_dynamic_local_mem_size_memset; + u64 kernel_dynamic_local_mem_size_atinit; + u64 kernel_dynamic_local_mem_size_decompress; + u64 kernel_dynamic_local_mem_size_aux1; + u64 kernel_dynamic_local_mem_size_aux2; + u64 kernel_dynamic_local_mem_size_aux3; + u64 kernel_dynamic_local_mem_size_aux4; + u32 kernel_accel; u32 kernel_accel_prev; u32 kernel_accel_min; diff --git a/src/backend.c b/src/backend.c index 4803f8688..91d8ddd5d 100644 --- a/src/backend.c +++ b/src/backend.c @@ -3275,13 +3275,14 @@ int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic device_param->kernel_params_atinit[0] = (void *) &buf; device_param->kernel_params_atinit_buf64[1] = num_elements; - const u64 kernel_threads = device_param->kernel_wgs_atinit; + const u64 kernel_threads = device_param->kernel_wgs_atinit; + const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_atinit; num_elements = CEILDIV (num_elements, kernel_threads); CUfunction function = device_param->cuda_function_atinit; - if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_atinit, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_atinit, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; @@ -3299,7 +3300,8 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic device_param->kernel_params_memset_buf32[1] = value; device_param->kernel_params_memset_buf64[2] = num16d; - const u64 kernel_threads = device_param->kernel_wgs_memset; + const u64 kernel_threads = device_param->kernel_wgs_memset; + const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_memset; u64 num_elements = num16d; @@ -3314,7 +3316,7 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic //const size_t global_work_size[3] = { num_elements, 1, 1 }; //const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3428,21 +3430,58 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con const user_options_t *user_options = hashcat_ctx->user_options; u64 kernel_threads = 0; + u64 dynamic_shared_mem = 0; switch (kern_run) { - case KERN_RUN_1: kernel_threads = device_param->kernel_wgs1; break; - case KERN_RUN_12: kernel_threads = device_param->kernel_wgs12; break; - case KERN_RUN_2: kernel_threads = device_param->kernel_wgs2; break; - case KERN_RUN_23: kernel_threads = device_param->kernel_wgs23; break; - case KERN_RUN_3: kernel_threads = device_param->kernel_wgs3; break; - case KERN_RUN_4: kernel_threads = device_param->kernel_wgs4; break; - case KERN_RUN_INIT2: kernel_threads = device_param->kernel_wgs_init2; break; - case KERN_RUN_LOOP2: kernel_threads = device_param->kernel_wgs_loop2; break; - case KERN_RUN_AUX1: kernel_threads = device_param->kernel_wgs_aux1; break; - case KERN_RUN_AUX2: kernel_threads = device_param->kernel_wgs_aux2; break; - case KERN_RUN_AUX3: kernel_threads = device_param->kernel_wgs_aux3; break; - case KERN_RUN_AUX4: kernel_threads = device_param->kernel_wgs_aux4; break; + case KERN_RUN_1: + kernel_threads = device_param->kernel_wgs1; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size1; + break; + case KERN_RUN_12: + kernel_threads = device_param->kernel_wgs12; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size12; + break; + case KERN_RUN_2: + kernel_threads = device_param->kernel_wgs2; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2; + break; + case KERN_RUN_23: + kernel_threads = device_param->kernel_wgs23; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size23; + break; + case KERN_RUN_3: + kernel_threads = device_param->kernel_wgs3; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size3; + break; + case KERN_RUN_4: + kernel_threads = device_param->kernel_wgs4; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size4; + break; + case KERN_RUN_INIT2: + kernel_threads = device_param->kernel_wgs_init2; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_init2; + break; + case KERN_RUN_LOOP2: + kernel_threads = device_param->kernel_wgs_loop2; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2; + break; + case KERN_RUN_AUX1: + kernel_threads = device_param->kernel_wgs_aux1; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux1; + break; + case KERN_RUN_AUX2: + kernel_threads = device_param->kernel_wgs_aux2; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux2; + break; + case KERN_RUN_AUX3: + kernel_threads = device_param->kernel_wgs_aux3; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux3; + break; + case KERN_RUN_AUX4: + kernel_threads = device_param->kernel_wgs_aux4; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_aux4; + break; } kernel_threads = MIN (kernel_threads, device_param->kernel_threads); @@ -3482,7 +3521,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con { if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 32, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 32, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1; } @@ -3512,7 +3551,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1; } @@ -3728,13 +3767,23 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num) { - u64 kernel_threads = 0; + u64 kernel_threads = 0; + u64 dynamic_shared_mem = 0; switch (kern_run) { - case KERN_RUN_MP: kernel_threads = device_param->kernel_wgs_mp; break; - case KERN_RUN_MP_R: kernel_threads = device_param->kernel_wgs_mp_r; break; - case KERN_RUN_MP_L: kernel_threads = device_param->kernel_wgs_mp_l; break; + case KERN_RUN_MP: + kernel_threads = device_param->kernel_wgs_mp; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_mp; + break; + case KERN_RUN_MP_R: + kernel_threads = device_param->kernel_wgs_mp_r; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_mp_r; + break; + case KERN_RUN_MP_L: + kernel_threads = device_param->kernel_wgs_mp_l; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_mp_l; + break; } u64 num_elements = num; @@ -3767,7 +3816,7 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, num_elements = CEILDIV (num_elements, kernel_threads); - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, cuda_args, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, cuda_args, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3826,7 +3875,8 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { - const u64 num_elements = 1024; // fixed + const u64 num_elements = 1024; // fixed + const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_tm; const u64 kernel_threads = MIN (num_elements, device_param->kernel_wgs_tm); @@ -3834,7 +3884,7 @@ int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { CUfunction cuda_function = device_param->cuda_function_tm; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_tm, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements / kernel_threads, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_tm, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3862,7 +3912,8 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u64 num_elements = num; - const u64 kernel_threads = device_param->kernel_wgs_amp; + const u64 kernel_threads = device_param->kernel_wgs_amp; + const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_amp; if (device_param->is_cuda == true) { @@ -3870,7 +3921,7 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUfunction cuda_function = device_param->cuda_function_amp; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_amp, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_amp, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -3902,7 +3953,8 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device u64 num_elements = num; - const u64 kernel_threads = device_param->kernel_wgs_decompress; + const u64 kernel_threads = device_param->kernel_wgs_decompress; + const u64 dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_decompress; if (device_param->is_cuda == true) { @@ -3910,7 +3962,7 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device CUfunction cuda_function = device_param->cuda_function_decompress; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_decompress, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params_decompress, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -6739,6 +6791,35 @@ static int get_cuda_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, CUfunctio return 0; } +static int get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, CUfunction function, u64 *result) +{ + // AFAIK there's no way to query the maximum value for dynamic shared memory available (because it depends on kernel code). + // let's brute force it, therefore workaround the hashcat wrapper of cuFuncSetAttribute() + + #define MAX_ASSUMED_SHARED (1024 * 1024) + + for (int i = 0; i < MAX_ASSUMED_SHARED; i++) + { + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuFuncSetAttribute (function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, i); + + if (CU_err == CUDA_SUCCESS) continue; + + break; + } + + int dynamic_shared_size_bytes = 0; + + if (hc_cuFuncGetAttribute (hashcat_ctx, &dynamic_shared_size_bytes, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, function) == -1) return -1; + + *result = (u64) dynamic_shared_size_bytes; + + return 0; +} + static int get_opencl_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u32 *result) { size_t work_group_size = 0; @@ -6785,6 +6866,23 @@ static int get_opencl_kernel_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_devi return 0; } +static int get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_kernel kernel, u64 *result) +{ + cl_ulong dynamic_local_mem_size = 0; + + if (hc_clGetKernelWorkGroupInfo (hashcat_ctx, kernel, device_param->opencl_device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (dynamic_local_mem_size), &dynamic_local_mem_size, NULL) == -1) return -1; + + // unknown how to query this information in OpenCL + // we therefore reset to zero + // the above call to hc_clGetKernelWorkGroupInfo() is just to avoid compiler warnings + + dynamic_local_mem_size = 0; + + *result = dynamic_local_mem_size; + + return 0; +} + static u32 get_kernel_threads (const hc_device_param_t *device_param) { // this is an upper limit, a good start, since our strategy is to reduce thread counts only. @@ -7650,38 +7748,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } */ - char *build_options_module_buf = (char *) hcmalloc (build_options_sz); - - int build_options_module_len = 0; - - build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s ", build_options_buf); - - if (module_ctx->module_jit_build_options != MODULE_DEFAULT) - { - char *jit_build_options = module_ctx->module_jit_build_options (hashconfig, user_options, user_options_extra, hashes, device_param); - - if (jit_build_options != NULL) - { - build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options); - - // this is a bit ugly - // would be better to have the module return the value as value - - u32 fixed_local_size = 0; - - if (sscanf (jit_build_options, "-D FIXED_LOCAL_SIZE=%u", &fixed_local_size) == 1) - { - device_param->kernel_threads_min = fixed_local_size; - device_param->kernel_threads_max = fixed_local_size; - } - } - } - - build_options_module_buf[build_options_module_len] = 0; - #if defined (DEBUG) if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options '%s'", device_id + 1, build_options_buf); - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options_module '%s'", device_id + 1, build_options_module_buf); #endif /** @@ -7748,22 +7816,17 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } /** - * Prepare some logging buffer (CUDA only) but we need to do it on this level of the scope - * Other backends just dont use this - */ - - /** - * main kernel + * shared kernel with no hashconfig dependencies */ { /** - * kernel source filename + * kernel shared source filename */ char source_file[256] = { 0 }; - generate_source_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->shared_dir, source_file); + generate_source_kernel_shared_filename (folder_config->shared_dir, source_file); if (hc_path_read (source_file) == false) { @@ -7773,18 +7836,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } /** - * kernel cached filename + * kernel shared cached filename */ char cached_file[256] = { 0 }; - generate_cached_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->profile_dir, device_name_chksum, cached_file); - - /** - * load kernel - */ + generate_cached_kernel_shared_filename (folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); - const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "main_kernel", source_file, cached_file, build_options_module_buf, cache_disable, &device_param->opencl_program, &device_param->cuda_module); + const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "shared_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_shared, &device_param->cuda_module_shared); if (rc_load_kernel == false) { @@ -7792,22 +7851,138 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) return -1; } - } - hcfree (build_options_module_buf); + if (device_param->is_cuda == true) + { + // GPU memset + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_memset, device_param->cuda_module_shared, "gpu_memset") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_wgs_memset) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_dynamic_local_mem_size_memset) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_memset = device_param->cuda_warp_size; + + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; + + // GPU autotune init + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_atinit, device_param->cuda_module_shared, "gpu_atinit") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_dynamic_local_mem_size_atinit) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_atinit = device_param->cuda_warp_size; + + // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1; + // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1; + + // GPU decompress + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_decompress, device_param->cuda_module_shared, "gpu_decompress") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_decompress = device_param->cuda_warp_size; + } + + if (device_param->is_opencl == true) + { + // GPU memset + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_memset", &device_param->opencl_kernel_memset) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_wgs_memset) == -1) return -1; + + if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; + + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_dynamic_local_mem_size_memset) == -1) return -1; + + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_preferred_wgs_multiple_memset) == -1) return -1; + + // GPU autotune init + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; + + if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; + + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_dynamic_local_mem_size_atinit) == -1) return -1; + + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_preferred_wgs_multiple_atinit) == -1) return -1; + + // GPU decompress + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_decompress", &device_param->opencl_kernel_decompress) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; + + if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; + + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; + + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress) == -1) return -1; + } + } /** - * shared kernel with no hashconfig dependencies + * main kernel */ { + char *build_options_module_buf = (char *) hcmalloc (build_options_sz); + + int build_options_module_len = 0; + + build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s ", build_options_buf); + + if (module_ctx->module_jit_build_options != MODULE_DEFAULT) + { + char *jit_build_options = module_ctx->module_jit_build_options (hashconfig, user_options, user_options_extra, hashes, device_param); + + if (jit_build_options != NULL) + { + build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options); + + // this is a bit ugly + // would be better to have the module return the value as value + + u32 fixed_local_size = 0; + + if (sscanf (jit_build_options, "-D FIXED_LOCAL_SIZE=%u", &fixed_local_size) == 1) + { + device_param->kernel_threads_min = fixed_local_size; + device_param->kernel_threads_max = fixed_local_size; + } + } + } + + build_options_module_buf[build_options_module_len] = 0; + + #if defined (DEBUG) + if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_options_module '%s'", device_id + 1, build_options_module_buf); + #endif + /** - * kernel shared source filename + * kernel source filename */ char source_file[256] = { 0 }; - generate_source_kernel_shared_filename (folder_config->shared_dir, source_file); + generate_source_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->shared_dir, source_file); if (hc_path_read (source_file) == false) { @@ -7817,14 +7992,18 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } /** - * kernel shared cached filename + * kernel cached filename */ char cached_file[256] = { 0 }; - generate_cached_kernel_shared_filename (folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); + generate_cached_kernel_filename (user_options->slow_candidates, hashconfig->attack_exec, user_options_extra->attack_kern, kern_type, hashconfig->opti_type, folder_config->profile_dir, device_name_chksum, cached_file); - const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "shared_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_shared, &device_param->cuda_module_shared); + /** + * load kernel + */ + + const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "main_kernel", source_file, cached_file, build_options_module_buf, cache_disable, &device_param->opencl_program, &device_param->cuda_module); if (rc_load_kernel == false) { @@ -7832,6 +8011,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) return -1; } + + hcfree (build_options_module_buf); } /** @@ -8480,6 +8661,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size; // kernel2 @@ -8492,6 +8675,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size; // kernel3 @@ -8504,6 +8689,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size; } else @@ -8516,6 +8703,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; + device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size; } } @@ -8533,6 +8722,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size; // kernel2 @@ -8545,6 +8736,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size; // kernel3 @@ -8557,6 +8750,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size; } else @@ -8569,6 +8764,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_local_mem_size4) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; + device_param->kernel_preferred_wgs_multiple4 = device_param->cuda_warp_size; } } @@ -8590,6 +8787,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_local_mem_size_tm) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_tm, &device_param->kernel_dynamic_local_mem_size_tm) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_tm = device_param->cuda_warp_size; } } @@ -8607,6 +8806,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + device_param->kernel_preferred_wgs_multiple1 = device_param->cuda_warp_size; // kernel2 @@ -8619,6 +8820,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + device_param->kernel_preferred_wgs_multiple2 = device_param->cuda_warp_size; // kernel3 @@ -8631,6 +8834,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size; // kernel12 @@ -8645,6 +8850,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_local_mem_size12) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function12, &device_param->kernel_dynamic_local_mem_size12) == -1) return -1; + device_param->kernel_preferred_wgs_multiple12 = device_param->cuda_warp_size; } @@ -8660,6 +8867,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_local_mem_size23) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function23, &device_param->kernel_dynamic_local_mem_size23) == -1) return -1; + device_param->kernel_preferred_wgs_multiple23 = device_param->cuda_warp_size; } @@ -8675,6 +8884,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_local_mem_size_init2) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_init2, &device_param->kernel_dynamic_local_mem_size_init2) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_init2 = device_param->cuda_warp_size; } @@ -8690,6 +8901,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_local_mem_size_loop2) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2, &device_param->kernel_dynamic_local_mem_size_loop2) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_loop2 = device_param->cuda_warp_size; } @@ -8705,6 +8918,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_local_mem_size_aux1) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux1, &device_param->kernel_dynamic_local_mem_size_aux1) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_aux1 = device_param->cuda_warp_size; } @@ -8720,6 +8935,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_local_mem_size_aux2) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux2, &device_param->kernel_dynamic_local_mem_size_aux2) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_aux2 = device_param->cuda_warp_size; } @@ -8735,6 +8952,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_local_mem_size_aux3) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux3, &device_param->kernel_dynamic_local_mem_size_aux3) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_aux3 = device_param->cuda_warp_size; } @@ -8750,47 +8969,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_local_mem_size_aux4) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_aux4, &device_param->kernel_dynamic_local_mem_size_aux4) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_aux4 = device_param->cuda_warp_size; } } - // GPU memset - - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_memset, device_param->cuda_module_shared, "gpu_memset") == -1) return -1; - - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_wgs_memset) == -1) return -1; - - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; - - device_param->kernel_preferred_wgs_multiple_memset = device_param->cuda_warp_size; - - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; - - // GPU autotune init - - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_atinit, device_param->cuda_module_shared, "gpu_atinit") == -1) return -1; - - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; - - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; - - device_param->kernel_preferred_wgs_multiple_atinit = device_param->cuda_warp_size; - - // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1; - // CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1; - - // GPU decompress - - if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_decompress, device_param->cuda_module_shared, "gpu_decompress") == -1) return -1; - - if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; - - if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; - - device_param->kernel_preferred_wgs_multiple_decompress = device_param->cuda_warp_size; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1; //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1; //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1; @@ -8813,6 +8997,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_local_mem_size_mp_l) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_l, &device_param->kernel_dynamic_local_mem_size_mp_l) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_mp_l = device_param->cuda_warp_size; // mp_r @@ -8823,6 +9009,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_local_mem_size_mp_r) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp_r, &device_param->kernel_dynamic_local_mem_size_mp_r) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_mp_r = device_param->cuda_warp_size; if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) @@ -8839,6 +9027,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size; } else if (user_options->attack_mode == ATTACK_MODE_HYBRID2) @@ -8849,6 +9039,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_mp = device_param->cuda_warp_size; } } @@ -8870,10 +9062,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_local_mem_size_amp) == -1) return -1; + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_amp, &device_param->kernel_dynamic_local_mem_size_amp) == -1) return -1; + device_param->kernel_preferred_wgs_multiple_amp = device_param->cuda_warp_size; } -/* + /* if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) { // nothing to do @@ -8901,7 +9095,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) //if (CL_rc == -1) return -1; } } -*/ + */ } // zero some data buffers @@ -8997,6 +9191,24 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_opencl == true) { + // GPU memset + + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]) == -1) return -1; + + // GPU autotune init + + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]) == -1) return -1; + + // GPU decompress + + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]) == -1) return -1; + char kernel_name[64] = { 0 }; if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) @@ -9015,6 +9227,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_preferred_wgs_multiple1) == -1) return -1; // kernel2 @@ -9027,6 +9241,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_preferred_wgs_multiple2) == -1) return -1; // kernel3 @@ -9039,6 +9255,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_preferred_wgs_multiple3) == -1) return -1; } else @@ -9051,6 +9269,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_local_mem_size4) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_preferred_wgs_multiple4) == -1) return -1; } } @@ -9068,6 +9288,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_preferred_wgs_multiple1) == -1) return -1; // kernel2 @@ -9080,6 +9302,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_preferred_wgs_multiple2) == -1) return -1; // kernel3 @@ -9092,6 +9316,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_preferred_wgs_multiple3) == -1) return -1; } else @@ -9104,6 +9330,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_local_mem_size4) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_dynamic_local_mem_size4) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel4, &device_param->kernel_preferred_wgs_multiple4) == -1) return -1; } } @@ -9125,6 +9353,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_tm, &device_param->kernel_local_mem_size_tm) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_tm, &device_param->kernel_dynamic_local_mem_size_tm) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_tm, &device_param->kernel_preferred_wgs_multiple_tm) == -1) return -1; } } @@ -9142,6 +9372,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_local_mem_size1) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_dynamic_local_mem_size1) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel1, &device_param->kernel_preferred_wgs_multiple1) == -1) return -1; // kernel2 @@ -9154,6 +9386,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_local_mem_size2) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_dynamic_local_mem_size2) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2, &device_param->kernel_preferred_wgs_multiple2) == -1) return -1; // kernel3 @@ -9166,6 +9400,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_local_mem_size3) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_dynamic_local_mem_size3) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel3, &device_param->kernel_preferred_wgs_multiple3) == -1) return -1; // kernel12 @@ -9180,6 +9416,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel12, &device_param->kernel_local_mem_size12) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel12, &device_param->kernel_dynamic_local_mem_size12) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel12, &device_param->kernel_preferred_wgs_multiple12) == -1) return -1; } @@ -9195,6 +9433,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel23, &device_param->kernel_local_mem_size23) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel23, &device_param->kernel_dynamic_local_mem_size23) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel23, &device_param->kernel_preferred_wgs_multiple23) == -1) return -1; } @@ -9210,6 +9450,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_local_mem_size_init2) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_dynamic_local_mem_size_init2) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_preferred_wgs_multiple_init2) == -1) return -1; } @@ -9225,6 +9467,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2, &device_param->kernel_local_mem_size_loop2) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2, &device_param->kernel_dynamic_local_mem_size_loop2) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_loop2, &device_param->kernel_preferred_wgs_multiple_loop2) == -1) return -1; } @@ -9240,6 +9484,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux1, &device_param->kernel_local_mem_size_aux1) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux1, &device_param->kernel_dynamic_local_mem_size_aux1) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux1, &device_param->kernel_preferred_wgs_multiple_aux1) == -1) return -1; } @@ -9255,6 +9501,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux2, &device_param->kernel_local_mem_size_aux2) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux2, &device_param->kernel_dynamic_local_mem_size_aux2) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux2, &device_param->kernel_preferred_wgs_multiple_aux2) == -1) return -1; } @@ -9270,6 +9518,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux3, &device_param->kernel_local_mem_size_aux3) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux3, &device_param->kernel_dynamic_local_mem_size_aux3) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux3, &device_param->kernel_preferred_wgs_multiple_aux3) == -1) return -1; } @@ -9285,52 +9535,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux4, &device_param->kernel_local_mem_size_aux4) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_aux4, &device_param->kernel_dynamic_local_mem_size_aux4) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_aux4, &device_param->kernel_preferred_wgs_multiple_aux4) == -1) return -1; } } - // GPU memset - - if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_memset", &device_param->opencl_kernel_memset) == -1) return -1; - - if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_wgs_memset) == -1) return -1; - - if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_local_mem_size_memset) == -1) return -1; - - if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_preferred_wgs_multiple_memset) == -1) return -1; - - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]) == -1) return -1; - - // GPU autotune init - - if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1; - - if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_wgs_atinit) == -1) return -1; - - if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_local_mem_size_atinit) == -1) return -1; - - if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_preferred_wgs_multiple_atinit) == -1) return -1; - - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]) == -1) return -1; - - // GPU decompress - - if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_decompress", &device_param->opencl_kernel_decompress) == -1) return -1; - - if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_wgs_decompress) == -1) return -1; - - if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_local_mem_size_decompress) == -1) return -1; - - if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress) == -1) return -1; - - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]) == -1) return -1; - // MP start if (user_options->slow_candidates == true) @@ -9348,6 +9558,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp_l, &device_param->kernel_local_mem_size_mp_l) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp_l, &device_param->kernel_dynamic_local_mem_size_mp_l) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp_l, &device_param->kernel_preferred_wgs_multiple_mp_l) == -1) return -1; // mp_r @@ -9358,6 +9570,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp_r, &device_param->kernel_local_mem_size_mp_r) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp_r, &device_param->kernel_dynamic_local_mem_size_mp_r) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp_r, &device_param->kernel_preferred_wgs_multiple_mp_r) == -1) return -1; if (hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) @@ -9374,6 +9588,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_preferred_wgs_multiple_mp) == -1) return -1; } else if (user_options->attack_mode == ATTACK_MODE_HYBRID2) @@ -9384,6 +9600,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_local_mem_size_mp) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_dynamic_local_mem_size_mp) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_mp, &device_param->kernel_preferred_wgs_multiple_mp) == -1) return -1; } } @@ -9405,6 +9623,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_amp, &device_param->kernel_local_mem_size_amp) == -1) return -1; + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_amp, &device_param->kernel_dynamic_local_mem_size_amp) == -1) return -1; + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_amp, &device_param->kernel_preferred_wgs_multiple_amp) == -1) return -1; } diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c index 81c5203de..31c099730 100644 --- a/src/modules/module_03200.c +++ b/src/modules/module_03200.c @@ -91,6 +91,8 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) { fixed_local_size = 1; + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); } else { @@ -120,15 +122,37 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY { fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; } + + if (device_param->is_cuda == true) + { + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size); + } + else + { + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); + } } else { - fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + if (device_param->is_cuda == true) + { + // using kernel_dynamic_local_mem_size_memset is a bit hackish. + // we had to brute-force this value out of an already loaded CUDA function. + // there's no official way to query for this value. + + fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096; + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size); + } + else + { + fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); + } } } - hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); - return jit_build_options; }