diff --git a/docs/changes.txt b/docs/changes.txt index a13d8c3c9..b6cacc05a 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -61,6 +61,7 @@ - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time +- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier - OpenCL Runtime: Updated rocm detection diff --git a/src/interface.c b/src/interface.c index f81f9109c..9d422c020 100644 --- a/src/interface.c +++ b/src/interface.c @@ -25980,53 +25980,9 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p if (forced_kernel_threads) return forced_kernel_threads; - // otherwise it depends on (1) the opencl device type + // otherwise it depends on the opencl device type - u32 kernel_threads = 0; - - if (device_param->device_type & CL_DEVICE_TYPE_CPU) - { - kernel_threads = KERNEL_THREADS_MAX_CPU; - } - else if (device_param->device_type & CL_DEVICE_TYPE_GPU) - { - if (device_param->device_vendor_id == VENDOR_ID_NV) - { - kernel_threads = KERNEL_THREADS_MAX_GPU_NV; - } - else if (device_param->device_vendor_id == VENDOR_ID_AMD) - { - kernel_threads = KERNEL_THREADS_MAX_GPU_AMD; - } - else - { - kernel_threads = KERNEL_THREADS_MAX_GPU; - } - } - else - { - kernel_threads = KERNEL_THREADS_MAX_OTHER; - } - - if (user_options->workload_profile == 4) - { - kernel_threads = (u32) device_param->device_maxworkgroup_size; - } - - // and (2) an opencl device can force an lower value (limited resources on device) - - kernel_threads = MIN (kernel_threads, (u32) device_param->device_maxworkgroup_size); - - // and (3) if an OpenCL device allows a very high thread count (for example 1024 on nvidia), - // the host memory required is 32 times as high with 32 (It jumps from 128MB to 4GB device memory requirement). - // since there's no device with that much device memory (because of 1/4 memory rule) it has to limit the - // kernel_accel_max to be a very low number because the pws buffer will be so large otherwise. - // therefore autotune will be unable to calculate a good kernel_accel multiplier. - // currently there's no OpenCL device known that needs result in a better performance with 1024 threads compared to 256. - // as a result, we limit the number of threads to 64, which turns out to be a general good value. - // there's a 1.00% - 2.75% performance drop at NV caused by this, and 0.00% - 1.02% at AMD. - - kernel_threads = MIN (kernel_threads, 64); + const u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size; return kernel_threads; } diff --git a/src/opencl.c b/src/opencl.c index 7119760cb..f01d1d12f 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -4078,6 +4078,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_loops_max = user_options->kernel_loops; } + // limit scrypt accel otherwise we hurt ourself when calculating the scrypt tmto + + if ((hashconfig->hash_mode == 8900) || (hashconfig->hash_mode == 9300) || (hashconfig->hash_mode == 15700)) + { + // 16 is actually a bit low, we may need to change this depending on user response + + device_param->kernel_accel_max = MIN (device_param->kernel_accel_max, 16); + } + // we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) @@ -4356,6 +4365,12 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) size_t size_tmps = 4; size_t size_hooks = 4; + // instead of adding a thread limit we can also use a memory limit. + // this value should represent a reasonable amount of memory a host system has per GPU. + // note we're allocating 3 blocks of that size. + + #define PWS_SPACE (512 * 1024 * 1024) + // sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. // let's add some extra space just to be sure. @@ -4392,6 +4407,11 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) int memory_limit_hit = 0; + if (user_options->workload_profile < 4) + { + if (size_pws > PWS_SPACE) memory_limit_hit = 1; + } + if ((size_pws + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;