mirror of
https://github.com/hashcat/hashcat.git
synced 2024-12-23 15:18:16 +00:00
OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
This commit is contained in:
parent
088c45040b
commit
b0ff625844
@ -61,6 +61,7 @@
|
|||||||
- OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
|
- OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
|
||||||
- OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
|
- OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
|
||||||
- OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
|
- OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
|
||||||
|
- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
|
||||||
- OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
|
- OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
|
||||||
- OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
|
- OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
|
||||||
- OpenCL Runtime: Updated rocm detection
|
- OpenCL Runtime: Updated rocm detection
|
||||||
|
@ -25980,53 +25980,9 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
|
|||||||
|
|
||||||
if (forced_kernel_threads) return forced_kernel_threads;
|
if (forced_kernel_threads) return forced_kernel_threads;
|
||||||
|
|
||||||
// otherwise it depends on (1) the opencl device type
|
// otherwise it depends on the opencl device type
|
||||||
|
|
||||||
u32 kernel_threads = 0;
|
const u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size;
|
||||||
|
|
||||||
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
|
|
||||||
{
|
|
||||||
kernel_threads = KERNEL_THREADS_MAX_CPU;
|
|
||||||
}
|
|
||||||
else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
|
|
||||||
{
|
|
||||||
if (device_param->device_vendor_id == VENDOR_ID_NV)
|
|
||||||
{
|
|
||||||
kernel_threads = KERNEL_THREADS_MAX_GPU_NV;
|
|
||||||
}
|
|
||||||
else if (device_param->device_vendor_id == VENDOR_ID_AMD)
|
|
||||||
{
|
|
||||||
kernel_threads = KERNEL_THREADS_MAX_GPU_AMD;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
kernel_threads = KERNEL_THREADS_MAX_GPU;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
kernel_threads = KERNEL_THREADS_MAX_OTHER;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (user_options->workload_profile == 4)
|
|
||||||
{
|
|
||||||
kernel_threads = (u32) device_param->device_maxworkgroup_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
// and (2) an opencl device can force an lower value (limited resources on device)
|
|
||||||
|
|
||||||
kernel_threads = MIN (kernel_threads, (u32) device_param->device_maxworkgroup_size);
|
|
||||||
|
|
||||||
// and (3) if an OpenCL device allows a very high thread count (for example 1024 on nvidia),
|
|
||||||
// the host memory required is 32 times as high with 32 (It jumps from 128MB to 4GB device memory requirement).
|
|
||||||
// since there's no device with that much device memory (because of 1/4 memory rule) it has to limit the
|
|
||||||
// kernel_accel_max to be a very low number because the pws buffer will be so large otherwise.
|
|
||||||
// therefore autotune will be unable to calculate a good kernel_accel multiplier.
|
|
||||||
// currently there's no OpenCL device known that needs result in a better performance with 1024 threads compared to 256.
|
|
||||||
// as a result, we limit the number of threads to 64, which turns out to be a general good value.
|
|
||||||
// there's a 1.00% - 2.75% performance drop at NV caused by this, and 0.00% - 1.02% at AMD.
|
|
||||||
|
|
||||||
kernel_threads = MIN (kernel_threads, 64);
|
|
||||||
|
|
||||||
return kernel_threads;
|
return kernel_threads;
|
||||||
}
|
}
|
||||||
|
20
src/opencl.c
20
src/opencl.c
@ -4078,6 +4078,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
|
|||||||
device_param->kernel_loops_max = user_options->kernel_loops;
|
device_param->kernel_loops_max = user_options->kernel_loops;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// limit scrypt accel otherwise we hurt ourself when calculating the scrypt tmto
|
||||||
|
|
||||||
|
if ((hashconfig->hash_mode == 8900) || (hashconfig->hash_mode == 9300) || (hashconfig->hash_mode == 15700))
|
||||||
|
{
|
||||||
|
// 16 is actually a bit low, we may need to change this depending on user response
|
||||||
|
|
||||||
|
device_param->kernel_accel_max = MIN (device_param->kernel_accel_max, 16);
|
||||||
|
}
|
||||||
|
|
||||||
// we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep
|
// we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep
|
||||||
|
|
||||||
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
|
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
|
||||||
@ -4356,6 +4365,12 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
|
|||||||
size_t size_tmps = 4;
|
size_t size_tmps = 4;
|
||||||
size_t size_hooks = 4;
|
size_t size_hooks = 4;
|
||||||
|
|
||||||
|
// instead of adding a thread limit we can also use a memory limit.
|
||||||
|
// this value should represent a reasonable amount of memory a host system has per GPU.
|
||||||
|
// note we're allocating 3 blocks of that size.
|
||||||
|
|
||||||
|
#define PWS_SPACE (512 * 1024 * 1024)
|
||||||
|
|
||||||
// sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
|
// sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
|
||||||
// let's add some extra space just to be sure.
|
// let's add some extra space just to be sure.
|
||||||
|
|
||||||
@ -4392,6 +4407,11 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
|
|||||||
|
|
||||||
int memory_limit_hit = 0;
|
int memory_limit_hit = 0;
|
||||||
|
|
||||||
|
if (user_options->workload_profile < 4)
|
||||||
|
{
|
||||||
|
if (size_pws > PWS_SPACE) memory_limit_hit = 1;
|
||||||
|
}
|
||||||
|
|
||||||
if ((size_pws + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
|
if ((size_pws + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
|
||||||
if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
|
if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
|
||||||
if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
|
if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
|
||||||
|
Loading…
Reference in New Issue
Block a user