1
0
mirror of https://github.com/hashcat/hashcat.git synced 2024-12-23 15:18:16 +00:00

OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU

This commit is contained in:
Jens Steube 2018-02-09 16:42:28 +01:00
parent 088c45040b
commit b0ff625844
3 changed files with 23 additions and 46 deletions

View File

@ -61,6 +61,7 @@
- OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
- OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
- OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
- OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
- OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
- OpenCL Runtime: Updated rocm detection - OpenCL Runtime: Updated rocm detection

View File

@ -25980,53 +25980,9 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
if (forced_kernel_threads) return forced_kernel_threads; if (forced_kernel_threads) return forced_kernel_threads;
// otherwise it depends on (1) the opencl device type // otherwise it depends on the opencl device type
u32 kernel_threads = 0; const u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size;
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
{
kernel_threads = KERNEL_THREADS_MAX_CPU;
}
else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
{
if (device_param->device_vendor_id == VENDOR_ID_NV)
{
kernel_threads = KERNEL_THREADS_MAX_GPU_NV;
}
else if (device_param->device_vendor_id == VENDOR_ID_AMD)
{
kernel_threads = KERNEL_THREADS_MAX_GPU_AMD;
}
else
{
kernel_threads = KERNEL_THREADS_MAX_GPU;
}
}
else
{
kernel_threads = KERNEL_THREADS_MAX_OTHER;
}
if (user_options->workload_profile == 4)
{
kernel_threads = (u32) device_param->device_maxworkgroup_size;
}
// and (2) an opencl device can force an lower value (limited resources on device)
kernel_threads = MIN (kernel_threads, (u32) device_param->device_maxworkgroup_size);
// and (3) if an OpenCL device allows a very high thread count (for example 1024 on nvidia),
// the host memory required is 32 times as high with 32 (It jumps from 128MB to 4GB device memory requirement).
// since there's no device with that much device memory (because of 1/4 memory rule) it has to limit the
// kernel_accel_max to be a very low number because the pws buffer will be so large otherwise.
// therefore autotune will be unable to calculate a good kernel_accel multiplier.
// currently there's no OpenCL device known that needs result in a better performance with 1024 threads compared to 256.
// as a result, we limit the number of threads to 64, which turns out to be a general good value.
// there's a 1.00% - 2.75% performance drop at NV caused by this, and 0.00% - 1.02% at AMD.
kernel_threads = MIN (kernel_threads, 64);
return kernel_threads; return kernel_threads;
} }

View File

@ -4078,6 +4078,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
device_param->kernel_loops_max = user_options->kernel_loops; device_param->kernel_loops_max = user_options->kernel_loops;
} }
// limit scrypt accel otherwise we hurt ourself when calculating the scrypt tmto
if ((hashconfig->hash_mode == 8900) || (hashconfig->hash_mode == 9300) || (hashconfig->hash_mode == 15700))
{
// 16 is actually a bit low, we may need to change this depending on user response
device_param->kernel_accel_max = MIN (device_param->kernel_accel_max, 16);
}
// we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep // we have some absolute limits for fast hashes (because of limit constant memory), make sure not to overstep
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
@ -4356,6 +4365,12 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
size_t size_tmps = 4; size_t size_tmps = 4;
size_t size_hooks = 4; size_t size_hooks = 4;
// instead of adding a thread limit we can also use a memory limit.
// this value should represent a reasonable amount of memory a host system has per GPU.
// note we're allocating 3 blocks of that size.
#define PWS_SPACE (512 * 1024 * 1024)
// sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. // sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure. // let's add some extra space just to be sure.
@ -4392,6 +4407,11 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
int memory_limit_hit = 0; int memory_limit_hit = 0;
if (user_options->workload_profile < 4)
{
if (size_pws > PWS_SPACE) memory_limit_hit = 1;
}
if ((size_pws + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; if ((size_pws + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;