From 62a5a85dd6565e118191ed68bca854c09068dfcb Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Wed, 25 Jun 2025 11:21:51 +0200
Subject: [PATCH] Added 'next_power_of_two()' and moved both
 'next_power_of_two()' and 'previous_power_of_two()' to 'shared.c' Improved
 autotuner tweak logic and added boundary checks for accel and threads Fixed
 available host memory detection on Windows Fixed compilation error in MSYS2
 native shell Introduced an 8 GiB host memory usage limit per GPU, even if
 more is available Replaced fixed-size host memory detection per GPU with a
 dynamic kernel-accel based method (similar to GPU memory detection) Disabled
 hash-mode autodetection in the python bridge Removed default invocation of
 'rocm-smi' in 'benchmark_deep.pl' to avoid skewed initial results Reduced
 default runtime in 'benchmark_deep.pl' scripts due to improved benchmark
 accuracy in hashcat in general

---
 include/shared.h           |   3 +
 src/autotune.c             | 164 ++++++++++++-------------------------
 src/backend.c              |  69 +++++++---------
 src/modules/module_72000.c |   1 +
 src/modules/module_73000.c |   1 +
 src/shared.c               |  87 +++++++++++---------
 tools/benchmark_deep.pl    |   8 +-
 7 files changed, 139 insertions(+), 194 deletions(-)

diff --git a/include/shared.h b/include/shared.h
index 013cb1157..ae968281c 100644
--- a/include/shared.h
+++ b/include/shared.h
@@ -122,4 +122,7 @@ void restore_stderr (int saved_fd);
 
 bool get_free_memory (u64 *free_mem);
 
+u32 previous_power_of_two (const u32 x);
+u32 next_power_of_two (const u32 x);
+
 #endif // HC_SHARED_H
diff --git a/src/autotune.c b/src/autotune.c
index cfde1abed..8030d878a 100644
--- a/src/autotune.c
+++ b/src/autotune.c
@@ -8,6 +8,7 @@
 #include "event.h"
 #include "backend.h"
 #include "status.h"
+#include "shared.h"
 #include "autotune.h"
 
 int find_tuning_function (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param)
@@ -95,46 +96,6 @@ static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi
   return exec_msec_best;
 }
 
-static bool is_power_of_2 (const u32 n)
-{
- return n != 0 && (n & (n - 1)) == 0;
-}
-
-static u32 previous_power_of_two (const u32 x)
-{
-  // https://stackoverflow.com/questions/2679815/previous-power-of-2
-  // really cool!
-
-  if (x == 0) return 0;
-
-  u32 r = x;
-
-  r |= (r >>  1);
-  r |= (r >>  2);
-  r |= (r >>  4);
-  r |= (r >>  8);
-  r |= (r >> 16);
-
-  return r - (r >> 1);
-}
-
-static u32 next_power_of_two (const u32 x)
-{
-  if (x == 0) return 1;
-
-  u32 r = x - 1;
-
-  r |= (r >>  1);
-  r |= (r >>  2);
-  r |= (r >>  4);
-  r |= (r >>  8);
-  r |= (r >> 16);
-
-  r++;
-
-  return r;
-}
-
 static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 {
   const hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
@@ -434,87 +395,70 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     }
 
     if (kernel_accel > 64) kernel_accel -= kernel_accel % 32;
-  }
 
-  // some final play, if we have strange numbers from the APIs, namely 96, 384, and such
-
-  if (is_power_of_2 (kernel_threads) == false)
-  {
-    u32 fun[2];
-
-    fun[0] = previous_power_of_two (kernel_threads);
-    fun[1] = next_power_of_two (kernel_threads);
-
-    float fact[2];
-
-    fact[0] = (float) kernel_threads / fun[0];
-    fact[1] = (float) kernel_threads / fun[1];
-
-    float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
-
-    float res[2];
-
-    for (int i = 0; i < 2; i++)
+    if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
     {
-      const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel * fact[i], kernel_loops, fun[i], 2);
-
-      res[i] = ms_prev / ms;
+      if (kernel_accel > device_param->device_processors) kernel_accel -= kernel_accel % device_param->device_processors;
     }
 
-    const int sel = (res[0] > res[1]) ? 0 : 1;
+    // some final play, if we have strange numbers from the APIs, namely 96, 384, and such
 
-    if (res[sel] > 1.01)
+    if ((kernel_accel_min == kernel_accel_max) || (kernel_threads_min == kernel_threads_max))
     {
-      const u32 kernel_accel_new = kernel_accel * fact[sel];
-      const u32 kernel_threads_new = fun[sel];
+    }
+    else
+    {
+      u32 fun[2];
 
-      if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max))
+      if (is_power_of_2 (kernel_threads) == false)
       {
-        // we can't check kernel_threads because that is for sure outside the range
-
-        kernel_accel = kernel_accel_new;
-        kernel_threads = kernel_threads_new;
+        fun[0] = previous_power_of_two (kernel_threads);
+        fun[1] = next_power_of_two (kernel_threads);
       }
-    }
-  }
-  else
-  {
-    // that's also nice
-
-    u32 fun[2];
-
-    fun[0] = kernel_threads >> 1;
-    fun[1] = kernel_threads << 1;
-
-    float fact[2];
-
-    fact[0] = (float) kernel_threads / fun[0];
-    fact[1] = (float) kernel_threads / fun[1];
-
-    float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
-
-    float res[2];
-
-    for (int i = 0; i < 2; i++)
-    {
-      const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel * fact[i], kernel_loops, fun[i], 2);
-
-      res[i] = ms_prev / ms;
-    }
-
-    const int sel = (res[0] > res[1]) ? 0 : 1;
-
-    if (res[sel] > 1.01)
-    {
-      const u32 kernel_accel_new = kernel_accel * fact[sel];
-      const u32 kernel_threads_new = fun[sel];
-
-      if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max))
+      else
       {
-        // we can't check kernel_threads because that is for sure outside the range
+        fun[0] = kernel_threads >> 1;
+        fun[1] = kernel_threads << 1;
+      }
 
-        kernel_accel = kernel_accel_new;
-        kernel_threads = kernel_threads_new;
+      float fact[2];
+
+      fact[0] = (float) kernel_threads / fun[0];
+      fact[1] = (float) kernel_threads / fun[1];
+
+      float ms_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
+
+      float res[2] = { 0 };
+
+      for (int i = 0; i < 2; i++)
+      {
+        const u32 kernel_threads_test = fun[i];
+        const u32 kernel_accel_test =  kernel_accel * fact[i];
+
+        if (kernel_accel_test == 0) continue;
+        if (kernel_threads_test == 0) continue;
+
+        if (kernel_threads_test > device_param->device_maxworkgroup_size) continue;
+
+        const float ms = try_run_times (hashcat_ctx, device_param, kernel_accel_test, kernel_loops, kernel_threads_test, 2);
+
+        res[i] = ms_prev / ms;
+      }
+
+      const int sel = (res[0] > res[1]) ? 0 : 1;
+
+      if (res[sel] > 1.01)
+      {
+        const u32 kernel_accel_new = kernel_accel * fact[sel];
+        const u32 kernel_threads_new = fun[sel];
+
+        if ((kernel_accel_new >= kernel_accel_min) && (kernel_accel_new <= kernel_accel_max))
+        {
+          // we can't check kernel_threads because that is for sure outside the range
+
+          kernel_accel = kernel_accel_new;
+          kernel_threads = kernel_threads_new;
+        }
       }
     }
   }
diff --git a/src/backend.c b/src/backend.c
index a71bac223..cc0be63ad 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -15996,49 +15996,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     // check if there's enough host memory left for upcoming allocations, otherwise reduce skip device and present user an option to deal with
 
-    u64 accel_limit = 0;
+    u64 accel_limit_host = 0;
 
-    get_free_memory (&accel_limit);
-
-    // in slow candidate mode we need to keep the buffers on the host alive
-    // a high accel value doesn't help much here anyway
-
-    if (user_options->slow_candidates == true)
+    if (get_free_memory (&accel_limit_host) == false)
     {
-      // Tested with NTLM, almost no difference in performance
+      const u64 GiB4 = 4ULL * 1024 * 1024 * 1024;
 
-      accel_limit /= 8;
+      event_log_warning (hashcat_ctx, "Couldn't query the OS for free memory, assuming 4GiB");
+
+      accel_limit_host = GiB4;
     }
-
-    // this is device_processors * kernel_threads
-
-    accel_limit /= hardware_power_max;
-
-    // single password candidate size
-
-    accel_limit /= sizeof (pw_t);
-
-    // pws[], pws_comp[] and pw_pre[] are some large blocks with password candidates
-
-    accel_limit /= 3;
-
-    // Is possible that the GPU simply has too much hardware resources and 8GB per GPU is not enough, but OTOH we can't get lower than 1
-
-    accel_limit = MAX (accel_limit, 1);
-
-    // I think vector size is not required because vector_size is dividing the pws_cnt in run_kernel()
-
-    kernel_accel_max = MIN (kernel_accel_max, accel_limit);
-
-    if (kernel_accel_min > kernel_accel_max)
+    else
     {
-      event_log_error (hashcat_ctx, "* Device #%u: Not enough host memory left for this device, skipping...", device_id + 1);
-      event_log_error (hashcat_ctx, "             Retry with lower --kernel-threads value.");
+      // even tho let's not be greedy
 
-      backend_kernel_accel_warnings++;
+      const u64 GiB8 = 8ULL * 1024 * 1024 * 1024;
 
-      device_param->skipped_warning = true;
-      continue;
+      accel_limit_host = MIN (accel_limit_host, GiB8);
     }
 
     // Opposite direction check: find out if we would request too much memory on memory blocks which are based on kernel_accel
@@ -16184,6 +16158,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         if (size_kernel_params      > undocumented_single_allocation_apple) memory_limit_hit = 1;
       }
 
+      const u64 size_device_extra = (1024 * 1024 * 1024);
+
       const u64 size_total
         = bitmap_ctx->bitmap_size
         + bitmap_ctx->bitmap_size
@@ -16220,9 +16196,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         + size_st_salts
         + size_st_esalts
         + size_kernel_params
-        + size_spilling;
+        + size_spilling
+        + size_device_extra;
 
-      if ((size_total + EXTRA_SPACE) > device_param->device_available_mem) memory_limit_hit = 1;
+      if ((size_total + EXTRA_SPACE) > MIN (device_param->device_available_mem, device_param->device_maxmem_alloc)) memory_limit_hit = 1;
 
       if (memory_limit_hit == 1)
       {
@@ -16231,6 +16208,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         continue;
       }
 
+      const u64 size_host_extra = (512 * 1024 * 1024);
+
       const u64 size_total_host
         = size_pws_comp
         + size_pws_idx
@@ -16240,7 +16219,17 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         + size_brain_link_out
         #endif
         + size_pws_pre
-        + size_pws_base;
+        + size_pws_base
+        + size_host_extra;
+
+      if ((size_total_host + EXTRA_SPACE) > accel_limit_host) memory_limit_hit = 1;
+
+      if (memory_limit_hit == 1)
+      {
+        kernel_accel_max--;
+
+        continue;
+      }
 
       size_total_host_all += size_total_host;
 
@@ -16249,7 +16238,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     if (kernel_accel_max < kernel_accel_min)
     {
-      event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory for this attack.", device_id + 1);
+      event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory or free host memory for mapping.", device_id + 1);
 
       backend_memory_hit_warnings++;
 
diff --git a/src/modules/module_72000.c b/src/modules/module_72000.c
index a1268759e..d9995764c 100644
--- a/src/modules/module_72000.c
+++ b/src/modules/module_72000.c
@@ -25,6 +25,7 @@ static const u64   KERN_TYPE      = 72000;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
+                                  | OPTS_TYPE_AUTODETECT_DISABLE
                                   | OPTS_TYPE_NATIVE_THREADS
                                   | OPTS_TYPE_MP_MULTI_DISABLE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
diff --git a/src/modules/module_73000.c b/src/modules/module_73000.c
index f8516ce00..351893f26 100644
--- a/src/modules/module_73000.c
+++ b/src/modules/module_73000.c
@@ -25,6 +25,7 @@ static const u64   KERN_TYPE      = 73000;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
+                                  | OPTS_TYPE_AUTODETECT_DISABLE
                                   | OPTS_TYPE_NATIVE_THREADS
                                   | OPTS_TYPE_MP_MULTI_DISABLE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
diff --git a/src/shared.c b/src/shared.c
index a7d8dca4e..3c4f9c43f 100644
--- a/src/shared.c
+++ b/src/shared.c
@@ -21,13 +21,12 @@
 #endif
 
 #if defined (_WIN)
-#include <windows.h>
 #include <winsock2.h>
 #endif
 
-#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MSYS__)
-#else
+#if defined (_POSIX)
 #include <sys/utsname.h>
+#include <sys/sysinfo.h>
 #endif
 
 static const char *const PA_000 = "OK";
@@ -1462,13 +1461,13 @@ int generic_salt_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, const u8 *
   return tmp_len;
 }
 
-int get_current_arch()
+int get_current_arch ()
 {
-  #if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MSYS__)
+  #if defined (_WIN)
 
   SYSTEM_INFO sysinfo;
 
-  GetNativeSystemInfo(&sysinfo);
+  GetNativeSystemInfo (&sysinfo);
 
   switch (sysinfo.wProcessorArchitecture)
   {
@@ -1623,7 +1622,7 @@ void restore_stderr (int saved_fd)
 
 bool get_free_memory (u64 *free_mem)
 {
-  #if defined(_WIN) || defined(__CYGWIN__) || defined(__MSYS__)
+  #if defined (_WIN)
 
   MEMORYSTATUSEX memStatus;
 
@@ -1640,7 +1639,7 @@ bool get_free_memory (u64 *free_mem)
     return false;
   }
 
-  #elif defined(__APPLE__)
+  #elif defined (__APPLE__)
 
   mach_port_t host_port = mach_host_self ();
 
@@ -1661,43 +1660,51 @@ bool get_free_memory (u64 *free_mem)
 
   return true;
 
-  #elif defined(__linux__)
+  #else
 
-  FILE *fp = fopen ("/proc/meminfo", "r");
+  struct sysinfo info;
 
-  if (fp == NULL) return false;
+  if (sysinfo (&info) != 0) return false;
 
-  char line[256];
-
-  u64 memFree = 0;
-  u64 buffers = 0;
-  u64 cached = 0;
-
-  while (fgets (line, sizeof (line), fp))
-  {
-    if (sscanf (line, "MemFree: %lu kB", &memFree) == 1)
-    {
-      continue;
-    }
-    else if (sscanf (line, "Buffers: %lu kB", &buffers) == 1)
-    {
-      continue;
-    }
-    else if (sscanf (line, "Cached: %lu kB", &cached) == 1)
-    {
-      continue;
-    }
-  }
-
-  fclose (fp);
-
-  *free_mem = (memFree + buffers + cached) * 1024;
+  *free_mem = (u64) info.freeram * info.mem_unit;
 
   return true;
 
-  #else
-
-  return false;
-
   #endif
 }
+
+u32 previous_power_of_two (const u32 x)
+{
+  // https://stackoverflow.com/questions/2679815/previous-power-of-2
+  // really cool!
+
+  if (x == 0) return 0;
+
+  u32 r = x;
+
+  r |= (r >>  1);
+  r |= (r >>  2);
+  r |= (r >>  4);
+  r |= (r >>  8);
+  r |= (r >> 16);
+
+  return r - (r >> 1);
+}
+
+u32 next_power_of_two (const u32 x)
+{
+  if (x == 0) return 1;
+
+  u32 r = x - 1;
+
+  r |= (r >>  1);
+  r |= (r >>  2);
+  r |= (r >>  4);
+  r |= (r >>  8);
+  r |= (r >> 16);
+
+  r++;
+
+  return r;
+}
+
diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl
index b75d371a0..f6369e995 100755
--- a/tools/benchmark_deep.pl
+++ b/tools/benchmark_deep.pl
@@ -15,8 +15,8 @@ my $kernels_cache    = "$hashcat_path/kernels";
 my $hashcat_bin      = "$hashcat_path/hashcat";
 my $device           = 1;
 my $workload_profile = 3;
-my $runtime          = 24;
-my $sleep_sec        = 12;
+my $runtime          = 11;
+my $sleep_sec        = 13;
 my $default_mask     = "?a?a?a?a?a?a?a";
 my $result           = "result.txt";
 my $old_hashcat      = 0; # requires to have ran with new hashcat before to create the hashfiles
@@ -34,8 +34,8 @@ if ($cpu_benchmark == 1)
 }
 else
 {
-	#system ("rocm-smi --resetprofile --resetclocks --resetfans");
-  system ("rocm-smi --setfan 100% --setperflevel high");
+  #system ("rocm-smi --resetprofile --resetclocks --resetfans");
+  #system ("rocm-smi --setfan 100% --setperflevel high");
 
   system ("nvidia-settings -a GPUPowerMizerMode=1 -a GPUFanControlState=1 -a GPUTargetFanSpeed=100");
 }