mirror of
https://github.com/hashcat/hashcat.git
synced 2024-11-29 11:28:15 +00:00
OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count
This commit is contained in:
parent
a7a15df911
commit
d656e9c3a4
@ -61724,7 +61724,8 @@ __kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max)
|
||||
pw.i[62] = 0;
|
||||
pw.i[63] = 0; // yep that's faster
|
||||
|
||||
pw.pw_len = 1 + (l32 & 15);
|
||||
//pw.pw_len = 1 + (l32 & 15);
|
||||
pw.pw_len = 7; // some algorithms are very sensible on this (example: 12500)
|
||||
|
||||
buf[gid] = pw;
|
||||
}
|
||||
|
@ -63,6 +63,7 @@
|
||||
- OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
|
||||
- OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
|
||||
- OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
|
||||
- OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count
|
||||
- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
|
||||
- OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
|
||||
- OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
|
||||
|
@ -61,4 +61,7 @@ bool hc_same_files (char *file1, char *file2);
|
||||
u32 hc_strtoul (const char *nptr, char **endptr, int base);
|
||||
u64 hc_strtoull (const char *nptr, char **endptr, int base);
|
||||
|
||||
u32 power_of_two_ceil_32 (const u32 v);
|
||||
u32 power_of_two_floor_32 (const u32 v);
|
||||
|
||||
#endif // _SHARED_H
|
||||
|
@ -138,14 +138,19 @@ typedef enum amplifier_count
|
||||
KERNEL_BFS = 1024,
|
||||
KERNEL_COMBS = 1024,
|
||||
KERNEL_RULES = 256,
|
||||
KERNEL_THREADS_MAX_CPU = 1,
|
||||
KERNEL_THREADS_MAX_GPU = 8, // ex: intel integrated
|
||||
KERNEL_THREADS_MAX_GPU_NV = 32, // optimized NV size: warps
|
||||
KERNEL_THREADS_MAX_GPU_AMD = 64, // optimized AMD size: wavefronts
|
||||
KERNEL_THREADS_MAX_OTHER = 8, // ex: intel MIC
|
||||
|
||||
} amplifier_count_t;
|
||||
|
||||
typedef enum native_threads
|
||||
{
|
||||
KERNEL_THREADS_NATIVE_CPU = 1,
|
||||
KERNEL_THREADS_NATIVE_GPU = 8, // ex: intel integrated
|
||||
KERNEL_THREADS_NATIVE_GPU_NV = 32, // optimized NV size: warps
|
||||
KERNEL_THREADS_NATIVE_GPU_AMD = 64, // optimized AMD size: wavefronts
|
||||
KERNEL_THREADS_NATIVE_OTHER = 8, // ex: intel MIC
|
||||
|
||||
} native_threads_t;
|
||||
|
||||
typedef enum vendor_id
|
||||
{
|
||||
VENDOR_ID_AMD = (1 << 0),
|
||||
@ -923,6 +928,7 @@ typedef struct hc_device_param
|
||||
u64 device_global_mem;
|
||||
u32 device_maxclock_frequency;
|
||||
size_t device_maxworkgroup_size;
|
||||
u64 device_local_mem_size;
|
||||
|
||||
u32 vector_width;
|
||||
|
||||
|
@ -351,11 +351,11 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
|
||||
return -1;
|
||||
}
|
||||
|
||||
// we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance
|
||||
// but sometimes, when a benchmark kernel run cracks a kernel, we don't want to see that!
|
||||
|
||||
if (user_options->speed_only == true)
|
||||
{
|
||||
// we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance
|
||||
// however if the benchmark cracks the artificial hash used for benchmarks we don't want to see that!
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -25970,6 +25970,66 @@ u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx)
|
||||
return kernel_threads;
|
||||
}
|
||||
|
||||
u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
|
||||
{
|
||||
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
|
||||
|
||||
u32 kernel_threads = 0;
|
||||
|
||||
// sometimes there's a high kernel requirement for local memory (which is multiplied with threads)
|
||||
|
||||
u32 local_mem_per_thread = 0;
|
||||
|
||||
// basically the sum of all .local space of the _loop kernel
|
||||
// see .ptx
|
||||
|
||||
if (hashconfig->hash_mode == 1800) local_mem_per_thread = 1024;
|
||||
if (hashconfig->hash_mode == 12500) local_mem_per_thread = 3296;
|
||||
if (hashconfig->hash_mode == 13400) local_mem_per_thread = 5360;
|
||||
|
||||
if (local_mem_per_thread)
|
||||
{
|
||||
const u32 device_local_mem_size = (const u32) device_param->device_local_mem_size;
|
||||
|
||||
kernel_threads = device_local_mem_size / local_mem_per_thread;
|
||||
|
||||
// there can be some very unaligned results from this, therefore round it down to next power of two
|
||||
|
||||
kernel_threads = power_of_two_floor_32 (kernel_threads);
|
||||
}
|
||||
|
||||
// make sure to not underpower
|
||||
|
||||
if (kernel_threads)
|
||||
{
|
||||
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
|
||||
}
|
||||
else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
|
||||
{
|
||||
if (device_param->device_vendor_id == VENDOR_ID_NV)
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_NV);
|
||||
}
|
||||
else if (device_param->device_vendor_id == VENDOR_ID_AMD)
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_AMD);
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_OTHER);
|
||||
}
|
||||
}
|
||||
|
||||
return kernel_threads;
|
||||
}
|
||||
|
||||
u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
|
||||
{
|
||||
const user_options_t *user_options = hashcat_ctx->user_options;
|
||||
@ -25980,13 +26040,22 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
|
||||
|
||||
if (forced_kernel_threads) return forced_kernel_threads;
|
||||
|
||||
// otherwise it depends on the opencl device type
|
||||
// it can also depends on the opencl device type
|
||||
|
||||
u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size;
|
||||
u32 kernel_threads = (u32) device_param->device_maxworkgroup_size;
|
||||
|
||||
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
|
||||
{
|
||||
kernel_threads = MIN (kernel_threads, KERNEL_THREADS_MAX_CPU);
|
||||
kernel_threads = MIN (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
|
||||
}
|
||||
|
||||
// or if it requires for example a lot of local memory
|
||||
|
||||
const u32 limited_kernel_threads = hashconfig_limited_kernel_threads (hashcat_ctx, device_param);
|
||||
|
||||
if (limited_kernel_threads)
|
||||
{
|
||||
kernel_threads = MIN (kernel_threads, limited_kernel_threads);
|
||||
}
|
||||
|
||||
return kernel_threads;
|
||||
@ -25994,7 +26063,7 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
|
||||
|
||||
u32 hashconfig_get_kernel_loops (hashcat_ctx_t *hashcat_ctx)
|
||||
{
|
||||
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
|
||||
const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
|
||||
const user_options_t *user_options = hashcat_ctx->user_options;
|
||||
|
||||
u32 kernel_loops_fixed = 0;
|
||||
|
@ -3306,6 +3306,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
|
||||
device_param->skipped = true;
|
||||
}
|
||||
|
||||
device_param->device_local_mem_size = device_local_mem_size;
|
||||
|
||||
// If there's both an Intel CPU and an AMD OpenCL runtime it's a tricky situation
|
||||
// Both platforms support CPU device types and therefore both will try to use 100% of the physical resources
|
||||
// This results in both utilizing it for 50%
|
||||
|
29
src/shared.c
29
src/shared.c
@ -506,3 +506,32 @@ u64 hc_strtoull (const char *nptr, char **endptr, int base)
|
||||
{
|
||||
return (u64) strtoull (nptr, endptr, base);
|
||||
}
|
||||
|
||||
u32 power_of_two_ceil_32 (const u32 v)
|
||||
{
|
||||
u32 r = v;
|
||||
|
||||
r--;
|
||||
|
||||
r |= r >> 1;
|
||||
r |= r >> 2;
|
||||
r |= r >> 4;
|
||||
r |= r >> 8;
|
||||
r |= r >> 16;
|
||||
|
||||
r++;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
u32 power_of_two_floor_32 (const u32 v)
|
||||
{
|
||||
u32 r = power_of_two_ceil_32 (v);
|
||||
|
||||
if (r > v)
|
||||
{
|
||||
r >>= 1;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
10
src/status.c
10
src/status.c
@ -1389,15 +1389,17 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d
|
||||
|
||||
if (device_param->skipped == false)
|
||||
{
|
||||
for (int i = 0; i < SPEED_CACHE; i++)
|
||||
const u32 speed_pos = device_param->speed_pos;
|
||||
|
||||
for (int i = 0; i < speed_pos; i++)
|
||||
{
|
||||
speed_cnt += device_param->speed_cnt[i];
|
||||
speed_msec += device_param->speed_msec[i];
|
||||
}
|
||||
}
|
||||
|
||||
speed_cnt /= SPEED_CACHE;
|
||||
speed_msec /= SPEED_CACHE;
|
||||
speed_cnt /= speed_pos;
|
||||
speed_msec /= speed_pos;
|
||||
}
|
||||
|
||||
double hashes_dev_msec = 0;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user