1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-07-23 15:08:37 +00:00

Improved handling in get_opencl_kernel_wgs()

There are cases where we fix the thread count in a kernel using
FIXED_LOCAL_SIZE, but when the runtime loads the kernel binary, it
reports that it can only execute it with a different thread count.
According to the OpenCL specification, this can happen due to register
pressure.

However, we fix the thread count for a specific reason, and we choose to
accept potential register spilling to global memory. A warning is now
issued to inform the user about the runtime's suggested thread count,
allowing them to override it via the command line if they encounter
issues.

Also fixed the thread count for -m 10700 on NVIDIA's OpenCL, where 4
bytes of shared memory are always lost for an unknown reason (similar to
the issue seen in bcrypt).
This commit is contained in:
Jens Steube 2025-07-04 21:51:32 +02:00
parent 7ec73877fa
commit d3983edaf2
2 changed files with 39 additions and 23 deletions

View File

@ -9336,7 +9336,14 @@ static int get_opencl_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t
if (cwgs_total > 0)
{
kernel_threads = MIN (kernel_threads, (u32) cwgs_total);
if (kernel_threads < cwgs_total)
{
// Most likely a runtime bug: the runtime was unable to honor our requirement that this kernel is guaranteed to run with N threads
event_log_warning (hashcat_ctx, "* Device #%u: Runtime returned CL_KERNEL_WORK_GROUP_SIZE=%d, but CL_KERNEL_COMPILE_WORK_GROUP_SIZE=%d. Use -T%d if you run into problems.", device_param->device_id + 1, (int) kernel_threads, (int) cwgs_total, (int) kernel_threads);
}
kernel_threads = cwgs_total;
}
*result = kernel_threads;

View File

@ -152,38 +152,47 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
{
char *jit_build_options = NULL;
const u32 shared_size_scratch = (32 + 64 + 16); // LOCAL_VK u32 s_sc[FIXED_LOCAL_SIZE][PWMAXSZ4 + BLMAXSZ4 + AESSZ4];
const u32 shared_size_aes = (5 * 1024); // LOCAL_VK u32 s_te0[256];
if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
{
u32 native_threads = 0;
char *jit_build_options = NULL;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
{
native_threads = 1;
}
else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
#if defined (__APPLE__)
native_threads = 32;
#else
if (device_param->device_local_mem_size < 49152)
{
native_threads = MIN (device_param->kernel_preferred_wgs_multiple, 32); // We can't just set 32, because Intel GPU need 8
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", 1);
}
else
{
// to go over 48KiB, we need to use dynamic shared mem
native_threads = 49152 / 128;
u32 overhead = 0;
if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
{
// Note: we need to use device_param->device_local_mem_size - 4 because the OpenCL JIT fails with:
// Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
// on my development system. It is unclear where the 4 bytes are spent.
// I did some research on this and it seems to be related to the datatype.
// For example, if I use u8 instead, only 1 byte is wasted.
if (device_param->is_opencl == true)
{
overhead = 1;
}
}
#endif
const u32 device_local_mem_size = MIN (device_param->device_local_mem_size, 48*1024);
u32 fixed_local_size = ((device_local_mem_size - overhead) - shared_size_aes) / shared_size_scratch;
if (user_options->kernel_threads_chgd == true)
{
fixed_local_size = user_options->kernel_threads;
}
else
{
if (fixed_local_size > device_param->kernel_preferred_wgs_multiple) fixed_local_size -= fixed_local_size % device_param->kernel_preferred_wgs_multiple;
}
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", native_threads);
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", fixed_local_size);
}
return jit_build_options;