1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-07-25 16:08:39 +00:00

Added 'next_power_of_two()' and moved both 'next_power_of_two()' and 'previous_power_of_two()' to 'shared.c'

Improved autotuner tweak logic and added boundary checks for accel and threads
Fixed available host memory detection on Windows
Fixed compilation error in MSYS2 native shell
Introduced an 8 GiB host memory usage limit per GPU, even if more is available
Replaced fixed-size host memory detection per GPU with a dynamic kernel-accel based method (similar to GPU memory detection)
Disabled hash-mode autodetection in the python bridge
Removed default invocation of 'rocm-smi' in 'benchmark_deep.pl' to avoid skewed initial results
Reduced default runtime in 'benchmark_deep.pl' scripts due to improved benchmark accuracy in hashcat in general
This commit is contained in:
Jens Steube 2025-06-25 11:21:51 +02:00
parent 669bd619cd
commit 62a5a85dd6
7 changed files with 139 additions and 194 deletions

View File

@ -122,4 +122,7 @@ void restore_stderr (int saved_fd);
bool get_free_memory (u64 *free_mem); bool get_free_memory (u64 *free_mem);
u32 previous_power_of_two (const u32 x);
u32 next_power_of_two (const u32 x);
#endif // HC_SHARED_H #endif // HC_SHARED_H

View File

@ -8,6 +8,7 @@
#include "event.h" #include "event.h"
#include "backend.h" #include "backend.h"
#include "status.h" #include "status.h"
#include "shared.h"
#include "autotune.h" #include "autotune.h"
int find_tuning_function (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param) int find_tuning_function (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param)
@ -95,46 +96,6 @@ static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi
return exec_msec_best; return exec_msec_best;
} }
static bool is_power_of_2 (const u32 n)
{
return n != 0 && (n & (n - 1)) == 0;
}
static u32 previous_power_of_two (const u32 x)
{
// https://stackoverflow.com/questions/2679815/previous-power-of-2
// really cool!
if (x == 0) return 0;
u32 r = x;
r |= (r >> 1);
r |= (r >> 2);
r |= (r >> 4);
r |= (r >> 8);
r |= (r >> 16);
return r - (r >> 1);
}
static u32 next_power_of_two (const u32 x)
{
if (x == 0) return 1;
u32 r = x - 1;
r |= (r >> 1);
r |= (r >> 2);
r |= (r >> 4);
r |= (r >> 8);
r |= (r >> 16);
r++;
return r;
}
static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
{ {
const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
@ -434,87 +395,70 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
} }
if (kernel_accel > 64) kernel_accel -= kernel_accel % 32; if (kernel_accel > 64) kernel_accel -= kernel_accel % 32;
}
// some final play, if we have strange numbers from the APIs, namely 96, 384, and such if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
if (is_power_of_2 (kernel_threads) == false)
{
u32 fun[2];
fun[0] = previous_power_of_two (kernel_threads);
fun[1] = next_power_of_two (kernel_threads);
float fact[2];
fact[0] = (float) kernel_threads / fun[0];
fact[1] = (float) kernel_threads / fun[1];
float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
float res[2];
for (int i = 0; i < 2; i++)
{ {
const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel * fact[i], kernel_loops, fun[i], 2); if (kernel_accel > device_param->device_processors) kernel_accel -= kernel_accel % device_param->device_processors;
res[i] = ms_prev / ms;
} }
const int sel = (res[0] > res[1]) ? 0 : 1; // some final play, if we have strange numbers from the APIs, namely 96, 384, and such
if (res[sel] > 1.01) if ((kernel_accel_min == kernel_accel_max) || (kernel_threads_min == kernel_threads_max))
{ {
const u32 kernel_accel_new = kernel_accel * fact[sel]; }
const u32 kernel_threads_new = fun[sel]; else
{
u32 fun[2];
if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max)) if (is_power_of_2 (kernel_threads) == false)
{ {
// we can't check kernel_threads because that is for sure outside the range fun[0] = previous_power_of_two (kernel_threads);
fun[1] = next_power_of_two (kernel_threads);
kernel_accel = kernel_accel_new;
kernel_threads = kernel_threads_new;
} }
} else
}
else
{
// that's also nice
u32 fun[2];
fun[0] = kernel_threads >> 1;
fun[1] = kernel_threads << 1;
float fact[2];
fact[0] = (float) kernel_threads / fun[0];
fact[1] = (float) kernel_threads / fun[1];
float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
float res[2];
for (int i = 0; i < 2; i++)
{
const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel * fact[i], kernel_loops, fun[i], 2);
res[i] = ms_prev / ms;
}
const int sel = (res[0] > res[1]) ? 0 : 1;
if (res[sel] > 1.01)
{
const u32 kernel_accel_new = kernel_accel * fact[sel];
const u32 kernel_threads_new = fun[sel];
if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max))
{ {
// we can't check kernel_threads because that is for sure outside the range fun[0] = kernel_threads >> 1;
fun[1] = kernel_threads << 1;
}
kernel_accel = kernel_accel_new; float fact[2];
kernel_threads = kernel_threads_new;
fact[0] = (float) kernel_threads / fun[0];
fact[1] = (float) kernel_threads / fun[1];
float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
float res[2] = { 0 };
for (int i = 0; i < 2; i++)
{
const u32 kernel_threads_test = fun[i];
const u32 kernel_accel_test = kernel_accel * fact[i];
if (kernel_accel_test == 0) continue;
if (kernel_threads_test == 0) continue;
if (kernel_threads_test > device_param->device_maxworkgroup_size) continue;
const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel_test, kernel_loops, kernel_threads_test, 2);
res[i] = ms_prev / ms;
}
const int sel = (res[0] > res[1]) ? 0 : 1;
if (res[sel] > 1.01)
{
const u32 kernel_accel_new = kernel_accel * fact[sel];
const u32 kernel_threads_new = fun[sel];
if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max))
{
// we can't check kernel_threads because that is for sure outside the range
kernel_accel = kernel_accel_new;
kernel_threads = kernel_threads_new;
}
} }
} }
} }

View File

@ -15996,49 +15996,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
// check if there's enough host memory left for upcoming allocations, otherwise reduce skip device and present user an option to deal with // check if there's enough host memory left for upcoming allocations, otherwise reduce skip device and present user an option to deal with
u64 accel_limit = 0; u64 accel_limit_host = 0;
get_free_memory (&accel_limit); if (get_free_memory (&accel_limit_host) == false)
// in slow candidate mode we need to keep the buffers on the host alive
// a high accel value doesn't help much here anyway
if (user_options->slow_candidates == true)
{ {
// Tested with NTLM, almost no difference in performance const u64 GiB4 = 4ULL * 1024 * 1024 * 1024;
accel_limit /= 8; event_log_warning (hashcat_ctx, "Couldn't query the OS for free memory, assuming 4GiB");
accel_limit_host = GiB4;
} }
else
// this is device_processors * kernel_threads
accel_limit /= hardware_power_max;
// single password candidate size
accel_limit /= sizeof (pw_t);
// pws[], pws_comp[] and pw_pre[] are some large blocks with password candidates
accel_limit /= 3;
// Is possible that the GPU simply has too much hardware resources and 8GB per GPU is not enough, but OTOH we can't get lower than 1
accel_limit = MAX (accel_limit, 1);
// I think vector size is not required because vector_size is dividing the pws_cnt in run_kernel()
kernel_accel_max = MIN (kernel_accel_max, accel_limit);
if (kernel_accel_min > kernel_accel_max)
{ {
event_log_error (hashcat_ctx, "* Device #%u: Not enough host memory left for this device, skipping...", device_id + 1); // even tho let's not be greedy
event_log_error (hashcat_ctx, " Retry with lower --kernel-threads value.");
backend_kernel_accel_warnings++; const u64 GiB8 = 8ULL * 1024 * 1024 * 1024;
device_param->skipped_warning = true; accel_limit_host = MIN (accel_limit_host, GiB8);
continue;
} }
// Opposite direction check: find out if we would request too much memory on memory blocks which are based on kernel_accel // Opposite direction check: find out if we would request too much memory on memory blocks which are based on kernel_accel
@ -16184,6 +16158,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (size_kernel_params > undocumented_single_allocation_apple) memory_limit_hit = 1; if (size_kernel_params > undocumented_single_allocation_apple) memory_limit_hit = 1;
} }
const u64 size_device_extra = (1024 * 1024 * 1024);
const u64 size_total const u64 size_total
= bitmap_ctx->bitmap_size = bitmap_ctx->bitmap_size
+ bitmap_ctx->bitmap_size + bitmap_ctx->bitmap_size
@ -16220,9 +16196,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
+ size_st_salts + size_st_salts
+ size_st_esalts + size_st_esalts
+ size_kernel_params + size_kernel_params
+ size_spilling; + size_spilling
+ size_device_extra;
if ((size_total + EXTRA_SPACE) > device_param->device_available_mem) memory_limit_hit = 1; if ((size_total + EXTRA_SPACE) > MIN (device_param->device_available_mem, device_param->device_maxmem_alloc)) memory_limit_hit = 1;
if (memory_limit_hit == 1) if (memory_limit_hit == 1)
{ {
@ -16231,6 +16208,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
continue; continue;
} }
const u64 size_host_extra = (512 * 1024 * 1024);
const u64 size_total_host const u64 size_total_host
= size_pws_comp = size_pws_comp
+ size_pws_idx + size_pws_idx
@ -16240,7 +16219,17 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
+ size_brain_link_out + size_brain_link_out
#endif #endif
+ size_pws_pre + size_pws_pre
+ size_pws_base; + size_pws_base
+ size_host_extra;
if ((size_total_host + EXTRA_SPACE) > accel_limit_host) memory_limit_hit = 1;
if (memory_limit_hit == 1)
{
kernel_accel_max--;
continue;
}
size_total_host_all += size_total_host; size_total_host_all += size_total_host;
@ -16249,7 +16238,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (kernel_accel_max < kernel_accel_min) if (kernel_accel_max < kernel_accel_min)
{ {
event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory for this attack.", device_id + 1); event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory or free host memory for mapping.", device_id + 1);
backend_memory_hit_warnings++; backend_memory_hit_warnings++;

View File

@ -25,6 +25,7 @@ static const u64 KERN_TYPE = 72000;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_PT_GENERATE_LE
| OPTS_TYPE_AUTODETECT_DISABLE
| OPTS_TYPE_NATIVE_THREADS | OPTS_TYPE_NATIVE_THREADS
| OPTS_TYPE_MP_MULTI_DISABLE; | OPTS_TYPE_MP_MULTI_DISABLE;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;

View File

@ -25,6 +25,7 @@ static const u64 KERN_TYPE = 73000;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_PT_GENERATE_LE
| OPTS_TYPE_AUTODETECT_DISABLE
| OPTS_TYPE_NATIVE_THREADS | OPTS_TYPE_NATIVE_THREADS
| OPTS_TYPE_MP_MULTI_DISABLE; | OPTS_TYPE_MP_MULTI_DISABLE;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;

View File

@ -21,13 +21,12 @@
#endif #endif
#if defined (_WIN) #if defined (_WIN)
#include <windows.h>
#include <winsock2.h> #include <winsock2.h>
#endif #endif
#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MSYS__) #if defined (_POSIX)
#else
#include <sys/utsname.h> #include <sys/utsname.h>
#include <sys/sysinfo.h>
#endif #endif
static const char *const PA_000 = "OK"; static const char *const PA_000 = "OK";
@ -1462,13 +1461,13 @@ int generic_salt_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, const u8 *
return tmp_len; return tmp_len;
} }
int get_current_arch() int get_current_arch ()
{ {
#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MSYS__) #if defined (_WIN)
SYSTEM_INFO sysinfo; SYSTEM_INFO sysinfo;
GetNativeSystemInfo(&sysinfo); GetNativeSystemInfo (&sysinfo);
switch (sysinfo.wProcessorArchitecture) switch (sysinfo.wProcessorArchitecture)
{ {
@ -1623,7 +1622,7 @@ void restore_stderr (int saved_fd)
bool get_free_memory (u64 *free_mem) bool get_free_memory (u64 *free_mem)
{ {
#if defined(_WIN) || defined(__CYGWIN__) || defined(__MSYS__) #if defined (_WIN)
MEMORYSTATUSEX memStatus; MEMORYSTATUSEX memStatus;
@ -1640,7 +1639,7 @@ bool get_free_memory (u64 *free_mem)
return false; return false;
} }
#elif defined(__APPLE__) #elif defined (__APPLE__)
mach_port_t host_port = mach_host_self (); mach_port_t host_port = mach_host_self ();
@ -1661,43 +1660,51 @@ bool get_free_memory (u64 *free_mem)
return true; return true;
#elif defined(__linux__) #else
FILE *fp = fopen ("/proc/meminfo", "r"); struct sysinfo info;
if (fp == NULL) return false; if (sysinfo (&info) != 0) return false;
char line[256]; *free_mem = (u64) info.freeram * info.mem_unit;
u64 memFree = 0;
u64 buffers = 0;
u64 cached = 0;
while (fgets (line, sizeof (line), fp))
{
if (sscanf (line, "MemFree: %lu kB", &memFree) == 1)
{
continue;
}
else if (sscanf (line, "Buffers: %lu kB", &buffers) == 1)
{
continue;
}
else if (sscanf (line, "Cached: %lu kB", &cached) == 1)
{
continue;
}
}
fclose (fp);
*free_mem = (memFree + buffers + cached) * 1024;
return true; return true;
#else
return false;
#endif #endif
} }
u32 previous_power_of_two (const u32 x)
{
// https://stackoverflow.com/questions/2679815/previous-power-of-2
// really cool!
if (x == 0) return 0;
u32 r = x;
r |= (r >> 1);
r |= (r >> 2);
r |= (r >> 4);
r |= (r >> 8);
r |= (r >> 16);
return r - (r >> 1);
}
u32 next_power_of_two (const u32 x)
{
if (x == 0) return 1;
u32 r = x - 1;
r |= (r >> 1);
r |= (r >> 2);
r |= (r >> 4);
r |= (r >> 8);
r |= (r >> 16);
r++;
return r;
}

View File

@ -15,8 +15,8 @@ my $kernels_cache = "$hashcat_path/kernels";
my $hashcat_bin = "$hashcat_path/hashcat"; my $hashcat_bin = "$hashcat_path/hashcat";
my $device = 1; my $device = 1;
my $workload_profile = 3; my $workload_profile = 3;
my $runtime = 24; my $runtime = 11;
my $sleep_sec = 12; my $sleep_sec = 13;
my $default_mask = "?a?a?a?a?a?a?a"; my $default_mask = "?a?a?a?a?a?a?a";
my $result = "result.txt"; my $result = "result.txt";
my $old_hashcat = 0; # requires to have ran with new hashcat before to create the hashfiles my $old_hashcat = 0; # requires to have ran with new hashcat before to create the hashfiles
@ -34,8 +34,8 @@ if ($cpu_benchmark == 1)
} }
else else
{ {
#system ("rocm-smi --resetprofile --resetclocks --resetfans"); #system ("rocm-smi --resetprofile --resetclocks --resetfans");
system ("rocm-smi --setfan 100% --setperflevel high"); #system ("rocm-smi --setfan 100% --setperflevel high");
system ("nvidia-settings -a GPUPowerMizerMode=1 -a GPUFanControlState=1 -a GPUTargetFanSpeed=100"); system ("nvidia-settings -a GPUPowerMizerMode=1 -a GPUFanControlState=1 -a GPUTargetFanSpeed=100");
} }