Trying out a tweak to autotune behavior related to -u loop tuning.

Since loop values increase by doubling in autotune, a slow hash-mode with, for example, 1000 iterations can end up with a suboptimal -u count. Currently, autotuning starts at 1 and doubles (2, 4, 8, ..., 512, 1024). If the maximum is 1000, autotune stops at 512, resulting in two kernel calls: one with 512 iterations and another with 488. The tweak attempts to find the smallest factor that, when repeatedly doubled, reaches the target exactly. For 1000, this would be 125, and for 1024, it would be 1.

However, this logic doesn't align well with how hashcat handles slow hash iterations. For instance, PBKDF2-based plugins typically set the iteration count to N-1, since the first iteration is handled by the `_init` kernel. So a plugin might set 1023 instead of 1024, and in such cases the logic would incorrectly assume 1023 is the minimum factor, which leads to suboptimal tuning. To work around this, the factor-finder is executed twice: once with the original iteration count and once with `iteration count + 1`. The configuration that results in the lower starting point is used.

Other stuff:

- Fixed a critical bug in the autotuner. This bug was introduced a few days ago. The autotuner has the ability to overtune the maximum allowed thread count under certain conditions, for example in unoptimized -a 0 cracking mode when using rules. Several parts of the hashcat core require strict adherence to this limit, especially when shared memory is involved. To resolve this while retaining overtuning for compatible modes, a new attribute `device_param->overtune_unfriendly` was introduced. When set to true, it prevents the autotuner from modifying `kernel_threads_max` and `kernel_accel_max`. Four sections in `backend.c` have been updated to set this flag, though additional areas may also require it.
- Moved the code that aligns `kernel_accel` to a multiple of the compute unit count into the overtune section.
- Fixed a bug in the HIP dynloader. It now reports actual error strings, provided the API returns them.
2025-08-01 19:38:26 +00:00 · 2025-06-27 21:52:57 +02:00 · 2025-06-27 21:52:57 +02:00 · 974934dcdf
commit 974934dcdf
parent bdc47abbe0
6 changed files with 48 additions and 12 deletions
--- a/include/shared.h
+++ b/include/shared.h
@ -33,6 +33,7 @@ bool overflow_check_u64_add (const u64 a, const u64 b);
 bool overflow_check_u64_mul (const u64 a, const u64 b);

 bool is_power_of_2 (const u32 v);
+u32 smallest_repeat_double (const u32 v);

 u32 get_random_num (const u32 min, const u32 max);

--- a/include/types.h
+++ b/include/types.h
@ -1375,6 +1375,8 @@ typedef struct hc_device_param
  u32     kernel_threads_min;
  u32     kernel_threads_max;

+  bool    overtune_unfriendly;  // whatever sets this decide we operate in a mode that is not allowing to overtune threads_max or accel_max in autotuner
+
  u64     kernel_power;
  u64     hardware_power;

--- a/src/autotune.c
+++ b/src/autotune.c
@ -98,6 +98,7 @@ static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi

 static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 {
+  const hashes_t       *hashes       = hashcat_ctx->hashes;
  const hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
  const backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
  const straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx;
@ -329,7 +330,25 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param

    // v7 autotuner is a lot more straight forward

-    for (u32 kernel_loops_test = kernel_loops_min; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1)
+    u32 kernel_loops_min_start = kernel_loops_min;
+
+    if (hashes && hashes->st_salts_buf)
+    {
+      u32 start = kernel_loops_max; // candidate start value for the doubling loop below; only ever lowered from here
+
+      start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter));     // iteration count as stored
+      start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter + 1)); // plugins may store N-1 instead of N
+
+      if (( hashes->st_salts_buf->salt_iter      % 125) == 0) start = MIN (start, 125);
+      if (((hashes->st_salts_buf->salt_iter + 1) % 125) == 0) start = MIN (start, 125); // parenthesized: % binds tighter than +
+
+      if ((start >= kernel_loops_min) && (start <= kernel_loops_max)) // ignore a candidate outside the configured loop range
+      {
+        kernel_loops_min_start = start;
+      }
+    }
+
+    for (u32 kernel_loops_test = kernel_loops_min_start; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1)
    {
      double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_test, kernel_threads_min, 2);

@ -401,20 +420,21 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
      if (kernel_accel > kernel_accel_max) kernel_accel = kernel_accel_max;
    }

-    if (kernel_accel > 64) kernel_accel -= kernel_accel % 32;
+    // overtune section. relevant if we have strange numbers from the APIs, namely 96, 384, and such
+    // this is a dangerous action, and we set conditions somewhere in the code to disable this

-    if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
-    {
-      if (kernel_accel > device_param->device_processors) kernel_accel -= kernel_accel % device_param->device_processors;
-    }
-
-    // some final play, if we have strange numbers from the APIs, namely 96, 384, and such
-
-    if ((kernel_accel_min == kernel_accel_max) || (kernel_threads_min == kernel_threads_max))
+    if ((kernel_accel_min == kernel_accel_max) || (kernel_threads_min == kernel_threads_max) || (device_param->overtune_unfriendly == true))
    {
    }
    else
    {
+      if (kernel_accel > 64) kernel_accel -= kernel_accel % 32;
+
+      if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+      {
+        if (kernel_accel > device_param->device_processors) kernel_accel -= kernel_accel % device_param->device_processors;
+      }
+
      u32 fun[2];

      if (is_power_of_2 (kernel_threads) == false)
--- a/src/backend.c
+++ b/src/backend.c
@ -10532,6 +10532,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

      device_param->kernel_loops_max = MIN (device_param->kernel_loops_max, 1024);  // autotune go over ...
      device_param->kernel_loops_min = MIN (device_param->kernel_loops_min, device_param->kernel_loops_max);
+
+      device_param->overtune_unfriendly = true;
    }
    #endif

@ -11499,6 +11501,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
            {
              device_param->kernel_threads_min = fixed_local_size;
              // device_param->kernel_threads_max = fixed_local_size;
+
+              device_param->overtune_unfriendly = true;
            }
          }
        }
@ -16014,6 +16018,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
        {
          device_param->kernel_threads_min = MIN (device_param->kernel_threads_min, 64);
          device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64);
+
+          device_param->overtune_unfriendly = true;
        }
      }
    }
@ -16032,6 +16038,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)

      device_param->kernel_threads_min = MIN (device_param->kernel_threads_min, 64);
      device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64);
+
+      device_param->overtune_unfriendly = true;
    }

    //    device_param->kernel_threads = kernel_threads;
--- a/src/ext_hip.c
+++ b/src/ext_hip.c
@ -133,8 +133,8 @@ int hip_init (void *hashcat_ctx)
  HC_LOAD_FUNC_HIP (hip, hipEventRecord,            hipEventRecord,             HIP_HIPEVENTRECORD,             HIP, 1);
  HC_LOAD_FUNC_HIP (hip, hipEventSynchronize,       hipEventSynchronize,        HIP_HIPEVENTSYNCHRONIZE,        HIP, 1);
  HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute,       hipFuncGetAttribute,        HIP_HIPFUNCGETATTRIBUTE,        HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetErrorName,           hipGetErrorName,            HIP_HIPGETERRORNAME,            HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetErrorString,         hipGetErrorString,          HIP_HIPGETERRORSTRING,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetErrorName,           hipDrvGetErrorName,         HIP_HIPGETERRORNAME,            HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetErrorString,         hipDrvGetErrorString,       HIP_HIPGETERRORSTRING,          HIP, 1);
  HC_LOAD_FUNC_HIP (hip, hipInit,                   hipInit,                    HIP_HIPINIT,                    HIP, 1);
  HC_LOAD_FUNC_HIP (hip, hipLaunchKernel,           hipModuleLaunchKernel,      HIP_HIPLAUNCHKERNEL,            HIP, 1);
  HC_LOAD_FUNC_HIP (hip, hipMemAlloc,               hipMalloc,                  HIP_HIPMEMALLOC,                HIP, 1);
--- a/src/shared.c
+++ b/src/shared.c
@ -206,6 +206,11 @@ bool is_power_of_2 (const u32 v)
  return (v && !(v & (v - 1)));
 }

+u32 smallest_repeat_double (const u32 v) // returns v with all factors of two removed: the smallest value that reaches v by repeated doubling
+{
+  return (v == 0) ? 0 : (v / (v & -v)); // (v & -v) isolates the lowest set bit; it is 0 for v == 0, so guard against dividing by zero
+}
+
 u32 mydivc32 (const u32 dividend, const u32 divisor)
 {
  u32 quotient = dividend / divisor;