From 62a5a85dd6565e118191ed68bca854c09068dfcb Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 25 Jun 2025 11:21:51 +0200 Subject: [PATCH] Added 'next_power_of_two()' and moved both 'next_power_of_two()' and 'previous_power_of_two()' to 'shared.c' Improved autotuner tweak logic and added boundary checks for accel and threads Fixed available host memory detection on Windows Fixed compilation error in MSYS2 native shell Introduced an 8 GiB host memory usage limit per GPU, even if more is available Replaced fixed-size host memory detection per GPU with a dynamic kernel-accel based method (similar to GPU memory detection) Disabled hash-mode autodetection in the python bridge Removed default invocation of 'rocm-smi' in 'benchmark_deep.pl' to avoid skewed initial results Reduced default runtime in 'benchmark_deep.pl' scripts due to improved benchmark accuracy in hashcat in general --- include/shared.h | 3 + src/autotune.c | 164 ++++++++++++------------------------- src/backend.c | 69 +++++++--------- src/modules/module_72000.c | 1 + src/modules/module_73000.c | 1 + src/shared.c | 87 +++++++++++--------- tools/benchmark_deep.pl | 8 +- 7 files changed, 139 insertions(+), 194 deletions(-) diff --git a/include/shared.h b/include/shared.h index 013cb1157..ae968281c 100644 --- a/include/shared.h +++ b/include/shared.h @@ -122,4 +122,7 @@ void restore_stderr (int saved_fd); bool get_free_memory (u64 *free_mem); +u32 previous_power_of_two (const u32 x); +u32 next_power_of_two (const u32 x); + #endif // HC_SHARED_H diff --git a/src/autotune.c b/src/autotune.c index cfde1abed..8030d878a 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -8,6 +8,7 @@ #include "event.h" #include "backend.h" #include "status.h" +#include "shared.h" #include "autotune.h" int find_tuning_function (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param) @@ -95,46 +96,6 @@ static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi return exec_msec_best; } -static bool is_power_of_2 (const u32 n) -{ - return n != 0 && (n & (n - 1)) == 0; -} - -static u32 previous_power_of_two (const u32 x) -{ - // https://stackoverflow.com/questions/2679815/previous-power-of-2 - // really cool! - - if (x == 0) return 0; - - u32 r = x; - - r |= (r >> 1); - r |= (r >> 2); - r |= (r >> 4); - r |= (r >> 8); - r |= (r >> 16); - - return r - (r >> 1); -} - -static u32 next_power_of_two (const u32 x) -{ - if (x == 0) return 1; - - u32 r = x - 1; - - r |= (r >> 1); - r |= (r >> 2); - r |= (r >> 4); - r |= (r >> 8); - r |= (r >> 16); - - r++; - - return r; -} - static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; @@ -434,87 +395,70 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } if (kernel_accel > 64) kernel_accel -= kernel_accel % 32; - } - // some final play, if we have strange numbers from the APIs, namely 96, 384, and such - - if (is_power_of_2 (kernel_threads) == false) - { - u32 fun[2]; - - fun[0] = previous_power_of_two (kernel_threads); - fun[1] = next_power_of_two (kernel_threads); - - float fact[2]; - - fact[0] = (float) kernel_threads / fun[0]; - fact[1] = (float) kernel_threads / fun[1]; - - float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2); - - float res[2]; - - for (int i = 0; i < 2; i++) + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) { - const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel * fact[i], kernel_loops, fun[i], 2); - - res[i] = ms_prev / ms; + if (kernel_accel > device_param->device_processors) kernel_accel -= kernel_accel % device_param->device_processors; } - const int sel = (res[0] > res[1]) ? 0 : 1; + // some final play, if we have strange numbers from the APIs, namely 96, 384, and such - if (res[sel] > 1.01) + if ((kernel_accel_min == kernel_accel_max) || (kernel_threads_min == kernel_threads_max)) { - const u32 kernel_accel_new = kernel_accel * fact[sel]; - const u32 kernel_threads_new = fun[sel]; + } + else + { + u32 fun[2]; - if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max)) + if (is_power_of_2 (kernel_threads) == false) { - // we can't check kernel_threads because that is for sure outside the range - - kernel_accel = kernel_accel_new; - kernel_threads = kernel_threads_new; + fun[0] = previous_power_of_two (kernel_threads); + fun[1] = next_power_of_two (kernel_threads); } - } - } - else - { - // that's also nice - - u32 fun[2]; - - fun[0] = kernel_threads >> 1; - fun[1] = kernel_threads << 1; - - float fact[2]; - - fact[0] = (float) kernel_threads / fun[0]; - fact[1] = (float) kernel_threads / fun[1]; - - float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2); - - float res[2]; - - for (int i = 0; i < 2; i++) - { - const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel * fact[i], kernel_loops, fun[i], 2); - - res[i] = ms_prev / ms; - } - - const int sel = (res[0] > res[1]) ? 0 : 1; - - if (res[sel] > 1.01) - { - const u32 kernel_accel_new = kernel_accel * fact[sel]; - const u32 kernel_threads_new = fun[sel]; - - if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max)) + else { - // we can't check kernel_threads because that is for sure outside the range + fun[0] = kernel_threads >> 1; + fun[1] = kernel_threads << 1; + } - kernel_accel = kernel_accel_new; - kernel_threads = kernel_threads_new; + float fact[2]; + + fact[0] = (float) kernel_threads / fun[0]; + fact[1] = (float) kernel_threads / fun[1]; + + float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2); + + float res[2] = { 0 }; + + for (int i = 0; i < 2; i++) + { + const u32 kernel_threads_test = fun[i]; + const u32 kernel_accel_test = kernel_accel * fact[i]; + + if (kernel_accel_test == 0) continue; + if (kernel_threads_test == 0) continue; + + if (kernel_threads_test > device_param->device_maxworkgroup_size) continue; + + const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel_test, kernel_loops, kernel_threads_test, 2); + + res[i] = ms_prev / ms; + } + + const int sel = (res[0] > res[1]) ? 0 : 1; + + if (res[sel] > 1.01) + { + const u32 kernel_accel_new = kernel_accel * fact[sel]; + const u32 kernel_threads_new = fun[sel]; + + if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max)) + { + // we can't check kernel_threads because that is for sure outside the range + + kernel_accel = kernel_accel_new; + kernel_threads = kernel_threads_new; + } } } } diff --git a/src/backend.c b/src/backend.c index a71bac223..cc0be63ad 100644 --- a/src/backend.c +++ b/src/backend.c @@ -15996,49 +15996,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // check if there's enough host memory left for upcoming allocations, otherwise reduce skip device and present user an option to deal with - u64 accel_limit = 0; + u64 accel_limit_host = 0; - get_free_memory (&accel_limit); - - // in slow candidate mode we need to keep the buffers on the host alive - // a high accel value doesn't help much here anyway - - if (user_options->slow_candidates == true) + if (get_free_memory (&accel_limit_host) == false) { - // Tested with NTLM, almost no difference in performance + const u64 GiB4 = 4ULL * 1024 * 1024 * 1024; - accel_limit /= 8; + event_log_warning (hashcat_ctx, "Couldn't query the OS for free memory, assuming 4GiB"); + + accel_limit_host = GiB4; } - - // this is device_processors * kernel_threads - - accel_limit /= hardware_power_max; - - // single password candidate size - - accel_limit /= sizeof (pw_t); - - // pws[], pws_comp[] and pw_pre[] are some large blocks with password candidates - - accel_limit /= 3; - - // Is possible that the GPU simply has too much hardware resources and 8GB per GPU is not enough, but OTOH we can't get lower than 1 - - accel_limit = MAX (accel_limit, 1); - - // I think vector size is not required because vector_size is dividing the pws_cnt in run_kernel() - - kernel_accel_max = MIN (kernel_accel_max, accel_limit); - - if (kernel_accel_min > kernel_accel_max) + else { - event_log_error (hashcat_ctx, "* Device #%u: Not enough host memory left for this device, skipping...", device_id + 1); - event_log_error (hashcat_ctx, " Retry with lower --kernel-threads value."); + // even tho let's not be greedy - backend_kernel_accel_warnings++; + const u64 GiB8 = 8ULL * 1024 * 1024 * 1024; - device_param->skipped_warning = true; - continue; + accel_limit_host = MIN (accel_limit_host, GiB8); } // Opposite direction check: find out if we would request too much memory on memory blocks which are based on kernel_accel @@ -16184,6 +16158,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (size_kernel_params > undocumented_single_allocation_apple) memory_limit_hit = 1; } + const u64 size_device_extra = (1024 * 1024 * 1024); + const u64 size_total = bitmap_ctx->bitmap_size + bitmap_ctx->bitmap_size @@ -16220,9 +16196,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) + size_st_salts + size_st_esalts + size_kernel_params - + size_spilling; + + size_spilling + + size_device_extra; - if ((size_total + EXTRA_SPACE) > device_param->device_available_mem) memory_limit_hit = 1; + if ((size_total + EXTRA_SPACE) > MIN (device_param->device_available_mem, device_param->device_maxmem_alloc)) memory_limit_hit = 1; if (memory_limit_hit == 1) { @@ -16231,6 +16208,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) continue; } + const u64 size_host_extra = (512 * 1024 * 1024); + const u64 size_total_host = size_pws_comp + size_pws_idx @@ -16240,7 +16219,17 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) + size_brain_link_out #endif + size_pws_pre - + size_pws_base; + + size_pws_base + + size_host_extra; + + if ((size_total_host + EXTRA_SPACE) > accel_limit_host) memory_limit_hit = 1; + + if (memory_limit_hit == 1) + { + kernel_accel_max--; + + continue; + } size_total_host_all += size_total_host; @@ -16249,7 +16238,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (kernel_accel_max < kernel_accel_min) { - event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory for this attack.", device_id + 1); + event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory or free host memory for mapping.", device_id + 1); backend_memory_hit_warnings++; diff --git a/src/modules/module_72000.c b/src/modules/module_72000.c index a1268759e..d9995764c 100644 --- a/src/modules/module_72000.c +++ b/src/modules/module_72000.c @@ -25,6 +25,7 @@ static const u64 KERN_TYPE = 72000; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_AUTODETECT_DISABLE | OPTS_TYPE_NATIVE_THREADS | OPTS_TYPE_MP_MULTI_DISABLE; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; diff --git a/src/modules/module_73000.c b/src/modules/module_73000.c index f8516ce00..351893f26 100644 --- a/src/modules/module_73000.c +++ b/src/modules/module_73000.c @@ -25,6 +25,7 @@ static const u64 KERN_TYPE = 73000; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_AUTODETECT_DISABLE | OPTS_TYPE_NATIVE_THREADS | OPTS_TYPE_MP_MULTI_DISABLE; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; diff --git a/src/shared.c b/src/shared.c index a7d8dca4e..3c4f9c43f 100644 --- a/src/shared.c +++ b/src/shared.c @@ -21,13 +21,12 @@ #endif #if defined (_WIN) -#include #include #endif -#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MSYS__) -#else +#if defined (_POSIX) #include +#include #endif static const char *const PA_000 = "OK"; @@ -1462,13 +1461,13 @@ int generic_salt_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, const u8 * return tmp_len; } -int get_current_arch() +int get_current_arch () { - #if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MSYS__) + #if defined (_WIN) SYSTEM_INFO sysinfo; - GetNativeSystemInfo(&sysinfo); + GetNativeSystemInfo (&sysinfo); switch (sysinfo.wProcessorArchitecture) { @@ -1623,7 +1622,7 @@ void restore_stderr (int saved_fd) bool get_free_memory (u64 *free_mem) { - #if defined(_WIN) || defined(__CYGWIN__) || defined(__MSYS__) + #if defined (_WIN) MEMORYSTATUSEX memStatus; @@ -1640,7 +1639,7 @@ bool get_free_memory (u64 *free_mem) return false; } - #elif defined(__APPLE__) + #elif defined (__APPLE__) mach_port_t host_port = mach_host_self (); @@ -1661,43 +1660,51 @@ bool get_free_memory (u64 *free_mem) return true; - #elif defined(__linux__) + #else - FILE *fp = fopen ("/proc/meminfo", "r"); + struct sysinfo info; - if (fp == NULL) return false; + if (sysinfo (&info) != 0) return false; - char line[256]; - - u64 memFree = 0; - u64 buffers = 0; - u64 cached = 0; - - while (fgets (line, sizeof (line), fp)) - { - if (sscanf (line, "MemFree: %lu kB", &memFree) == 1) - { - continue; - } - else if (sscanf (line, "Buffers: %lu kB", &buffers) == 1) - { - continue; - } - else if (sscanf (line, "Cached: %lu kB", &cached) == 1) - { - continue; - } - } - - fclose (fp); - - *free_mem = (memFree + buffers + cached) * 1024; + *free_mem = (u64) info.freeram * info.mem_unit; return true; - #else - - return false; - #endif } + +u32 previous_power_of_two (const u32 x) +{ + // https://stackoverflow.com/questions/2679815/previous-power-of-2 + // really cool! + + if (x == 0) return 0; + + u32 r = x; + + r |= (r >> 1); + r |= (r >> 2); + r |= (r >> 4); + r |= (r >> 8); + r |= (r >> 16); + + return r - (r >> 1); +} + +u32 next_power_of_two (const u32 x) +{ + if (x == 0) return 1; + + u32 r = x - 1; + + r |= (r >> 1); + r |= (r >> 2); + r |= (r >> 4); + r |= (r >> 8); + r |= (r >> 16); + + r++; + + return r; +} + diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl index b75d371a0..f6369e995 100755 --- a/tools/benchmark_deep.pl +++ b/tools/benchmark_deep.pl @@ -15,8 +15,8 @@ my $kernels_cache = "$hashcat_path/kernels"; my $hashcat_bin = "$hashcat_path/hashcat"; my $device = 1; my $workload_profile = 3; -my $runtime = 24; -my $sleep_sec = 12; +my $runtime = 11; +my $sleep_sec = 13; my $default_mask = "?a?a?a?a?a?a?a"; my $result = "result.txt"; my $old_hashcat = 0; # requires to have ran with new hashcat before to create the hashfiles @@ -34,8 +34,8 @@ if ($cpu_benchmark == 1) } else { - #system ("rocm-smi --resetprofile --resetclocks --resetfans"); - system ("rocm-smi --setfan 100% --setperflevel high"); + #system ("rocm-smi --resetprofile --resetclocks --resetfans"); + #system ("rocm-smi --setfan 100% --setperflevel high"); system ("nvidia-settings -a GPUPowerMizerMode=1 -a GPUFanControlState=1 -a GPUTargetFanSpeed=100"); }