diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index d3a1622a0..8034c4e0e 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -61724,7 +61724,8 @@ __kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max) pw.i[62] = 0; pw.i[63] = 0; // yep that's faster - pw.pw_len = 1 + (l32 & 15); + //pw.pw_len = 1 + (l32 & 15); + pw.pw_len = 7; // some algorithms are very sensitive to this (example: 12500) buf[gid] = pw; } diff --git a/docs/changes.txt b/docs/changes.txt index cac918b76..2388de6d0 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -63,6 +63,7 @@ - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time +- OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count - OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier diff --git a/include/shared.h b/include/shared.h index e5b74b621..0f8e38eef 100644 --- a/include/shared.h +++ b/include/shared.h @@ -61,4 +61,7 @@ bool hc_same_files (char *file1, char *file2); u32 hc_strtoul (const char *nptr, char **endptr, int base); u64 hc_strtoull (const char *nptr, char **endptr, int base); +u32 power_of_two_ceil_32 (const u32 v); +u32 power_of_two_floor_32 (const u32 v); + #endif // _SHARED_H diff --git a/include/types.h b/include/types.h index f51604fe4..32baedd1d 100644 --- a/include/types.h +++ b/include/types.h @@ -138,14 
+138,19 @@ typedef enum amplifier_count KERNEL_BFS = 1024, KERNEL_COMBS = 1024, KERNEL_RULES = 256, - KERNEL_THREADS_MAX_CPU = 1, - KERNEL_THREADS_MAX_GPU = 8, // ex: intel integrated - KERNEL_THREADS_MAX_GPU_NV = 32, // optimized NV size: warps - KERNEL_THREADS_MAX_GPU_AMD = 64, // optimized AMD size: wavefronts - KERNEL_THREADS_MAX_OTHER = 8, // ex: intel MIC } amplifier_count_t; +typedef enum native_threads +{ + KERNEL_THREADS_NATIVE_CPU = 1, + KERNEL_THREADS_NATIVE_GPU = 8, // ex: intel integrated + KERNEL_THREADS_NATIVE_GPU_NV = 32, // optimized NV size: warps + KERNEL_THREADS_NATIVE_GPU_AMD = 64, // optimized AMD size: wavefronts + KERNEL_THREADS_NATIVE_OTHER = 8, // ex: intel MIC + +} native_threads_t; + typedef enum vendor_id { VENDOR_ID_AMD = (1 << 0), @@ -923,6 +928,7 @@ typedef struct hc_device_param u64 device_global_mem; u32 device_maxclock_frequency; size_t device_maxworkgroup_size; + u64 device_local_mem_size; u32 vector_width; diff --git a/src/hashes.c b/src/hashes.c index b50a25e72..b5f1e2cde 100644 --- a/src/hashes.c +++ b/src/hashes.c @@ -351,11 +351,11 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, return -1; } - // we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance - // but sometimes, when a benchmark kernel run cracks a kernel, we don't want to see that! - if (user_options->speed_only == true) { + // we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance + // however if the benchmark cracks the artificial hash used for benchmarks we don't want to see that! 
+ return 0; } diff --git a/src/interface.c b/src/interface.c index 8fba39748..d49ae2c72 100644 --- a/src/interface.c +++ b/src/interface.c @@ -25970,6 +25970,66 @@ u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx) return kernel_threads; } +u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param) +{ + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + + u32 kernel_threads = 0; + + // sometimes there's a high kernel requirement for local memory (which is multiplied with threads) + + u32 local_mem_per_thread = 0; + + // basically the sum of all .local space of the _loop kernel + // see .ptx + + if (hashconfig->hash_mode == 1800) local_mem_per_thread = 1024; + if (hashconfig->hash_mode == 12500) local_mem_per_thread = 3296; + if (hashconfig->hash_mode == 13400) local_mem_per_thread = 5360; + + if (local_mem_per_thread) + { + const u32 device_local_mem_size = (const u32) device_param->device_local_mem_size; + + kernel_threads = device_local_mem_size / local_mem_per_thread; + + // there can be some very unaligned results from this, therefore round it down to next power of two + + kernel_threads = power_of_two_floor_32 (kernel_threads); + } + + // make sure to not underpower + + if (kernel_threads) + { + if (device_param->device_type & CL_DEVICE_TYPE_CPU) + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_CPU); + } + else if (device_param->device_type & CL_DEVICE_TYPE_GPU) + { + if (device_param->device_vendor_id == VENDOR_ID_NV) + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_NV); + } + else if (device_param->device_vendor_id == VENDOR_ID_AMD) + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_AMD); + } + else + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU); + } + } + else + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_OTHER); + } + } + + return kernel_threads; +} + u32 hashconfig_get_kernel_threads 
(hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param) { const user_options_t *user_options = hashcat_ctx->user_options; @@ -25980,13 +26040,22 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p if (forced_kernel_threads) return forced_kernel_threads; - // otherwise it depends on the opencl device type + // it can also depend on the opencl device type - u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size; + u32 kernel_threads = (u32) device_param->device_maxworkgroup_size; if (device_param->device_type & CL_DEVICE_TYPE_CPU) { - kernel_threads = MIN (kernel_threads, KERNEL_THREADS_MAX_CPU); + kernel_threads = MIN (kernel_threads, KERNEL_THREADS_NATIVE_CPU); + } + + // or if it requires for example a lot of local memory + + const u32 limited_kernel_threads = hashconfig_limited_kernel_threads (hashcat_ctx, device_param); + + if (limited_kernel_threads) + { + kernel_threads = MIN (kernel_threads, limited_kernel_threads); } return kernel_threads; @@ -25994,7 +26063,7 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p u32 hashconfig_get_kernel_loops (hashcat_ctx_t *hashcat_ctx) { - hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; const user_options_t *user_options = hashcat_ctx->user_options; u32 kernel_loops_fixed = 0; diff --git a/src/opencl.c b/src/opencl.c index ca5960e7e..f70b31694 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -3306,6 +3306,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->skipped = true; } + device_param->device_local_mem_size = device_local_mem_size; + // If there's both an Intel CPU and an AMD OpenCL runtime it's a tricky situation // Both platforms support CPU device types and therefore both will try to use 100% of the physical resources // This results in both utilizing it for 50% diff --git a/src/shared.c b/src/shared.c index 
7b0e957a9..ece503589 100644 --- a/src/shared.c +++ b/src/shared.c @@ -506,3 +506,32 @@ u64 hc_strtoull (const char *nptr, char **endptr, int base) { return (u64) strtoull (nptr, endptr, base); } + +u32 power_of_two_ceil_32 (const u32 v) +{ + u32 r = v; + + r--; + + r |= r >> 1; + r |= r >> 2; + r |= r >> 4; + r |= r >> 8; + r |= r >> 16; + + r++; + + return r; +} + +u32 power_of_two_floor_32 (const u32 v) +{ + u32 r = power_of_two_ceil_32 (v); + + if (r > v) + { + r >>= 1; + } + + return r; +} diff --git a/src/status.c b/src/status.c index 0a8ff502b..6a6423d4a 100644 --- a/src/status.c +++ b/src/status.c @@ -1389,15 +1389,17 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d if (device_param->skipped == false) { - for (int i = 0; i < SPEED_CACHE; i++) + const u32 speed_pos = device_param->speed_pos; + + for (int i = 0; i < speed_pos; i++) { speed_cnt += device_param->speed_cnt[i]; speed_msec += device_param->speed_msec[i]; } - } - speed_cnt /= SPEED_CACHE; - speed_msec /= SPEED_CACHE; + speed_cnt /= speed_pos; + speed_msec /= speed_pos; + } double hashes_dev_msec = 0;