
Increase default iteration count per kernel invocation from 1024 to 2048

Add support for iteration counts per kernel invocation below the default, enabling TMTO for low scrypt configurations such as N=1024
Use TMTO 2 whenever it still reaches 4 times the device processor count, instead of always using TMTO 1
Improve performance for low scrypt configurations (hash-mode 9300)
Fix unit test for 15700 to use the correct scrypt configuration
Disable CPU over-subscription for SCRYPT-based algorithms
Jens Steube 2025-06-15 21:14:40 +02:00
parent 4d2485db0f
commit e134564a73
4 changed files with 55 additions and 45 deletions
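
Background for the changes below: scrypt's smix phase stores N blocks of 128·r bytes in the V[] scratchpad, and a time-memory trade-off (TMTO) of t keeps only every 2^t-th block, recomputing skipped blocks on demand. A runnable sketch of the footprint (illustrative values and names, not hashcat code):

```c
#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  const uint64_t scrypt_N = 1024; // a "low" configuration, as in the commit message
  const uint64_t scrypt_r = 1;

  const uint64_t v_bytes = 128 * scrypt_r * scrypt_N; // full V[] scratchpad

  for (int tmto = 0; tmto <= 2; tmto++)
  {
    // keep every 2^tmto-th entry; up to 2^tmto - 1 BlockMix recomputations per lookup
    printf ("tmto=%d: %6" PRIu64 " bytes\n", tmto, v_bytes >> tmto);
  }

  return 0;
}
```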

View File

@@ -345,9 +345,11 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
     case 3: V = V3; break;
   }
 
-  // note: fixed 1024 iterations = forced -u 1024
+  // note: max 2048 iterations = forced -u 2048
 
-  for (u32 N_pos = 0; N_pos < 1024; N_pos++)
+  const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
+
+  for (u32 N_pos = 0; N_pos < N_max; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
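
The clamp above lets the host run the smix loop in slices no larger than 2048 iterations while still handling configurations whose remaining work (ySIZE here) is smaller than one slice. A runnable sketch of that chunking idea, assuming ySIZE corresponds to scrypt_N reduced by the trade-off (launch_smix_kernel and all values are hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

// Hypothetical stand-in for a kernel launch; hashcat drives this through
// its OpenCL/CUDA/HIP backend.
static void launch_smix_kernel (uint32_t pos, uint32_t count)
{
  printf ("launch: N_pos=%u, iterations=%u\n", pos, count);
}

int main (void)
{
  const uint32_t scrypt_N = 1024; // low configuration from the commit message
  const uint32_t tmto     = 0;

  const uint32_t total = scrypt_N >> tmto;              // iterations for one pass
  const uint32_t chunk = (2048 > total) ? total : 2048; // mirrors the kernel clamp

  for (uint32_t pos = 0; pos < total; pos += chunk)
  {
    const uint32_t count = ((total - pos) < chunk) ? (total - pos) : chunk;

    launch_smix_kernel (pos, count);
  }

  return 0;
}
```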

View File

@@ -26,8 +26,20 @@ inline __device__ uint4 operator + (const uint4 a, const uint4 b) { return mak
 inline __device__ uint4 operator ^ (const uint4 a, const uint4 b) { return make_uint4 ((a.x ^ b.x), (a.y ^ b.y), (a.z ^ b.z), (a.w ^ b.w)); }
 inline __device__ uint4 operator | (const uint4 a, const uint4 b) { return make_uint4 ((a.x | b.x), (a.y | b.y), (a.z | b.z), (a.w | b.w)); }
 inline __device__ void operator ^= ( uint4 &a, const uint4 b) { a.x ^= b.x; a.y ^= b.y; a.z ^= b.z; a.w ^= b.w; }
 #endif
 
-inline __device__ uint4 rotate (const uint4 a, const int n) { return ((a << n) | ((a >> (32 - n)))); }
+#if defined IS_CUDA || defined IS_HIP
+inline __device__ uint4 rotate (const uint4 a, const int n)
+{
+  uint4 r;
+
+  r.x = hc_rotl32_S (a.x, n);
+  r.y = hc_rotl32_S (a.y, n);
+  r.z = hc_rotl32_S (a.z, n);
+  r.w = hc_rotl32_S (a.w, n);
+
+  return r;
+}
+#endif
 
 #endif
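
The old one-liner built the vector rotate from uint4 shift operators; routing each 32-bit lane through hashcat's scalar helper presumably lets the CUDA/HIP compiler pick its native per-lane rotate/funnel-shift. A portable, runnable stand-in for what hc_rotl32_S computes on one lane (illustrative, not hashcat's implementation):

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32 (const uint32_t a, const int n)
{
  return (a << n) | (a >> ((32 - n) & 31)); // mask keeps n == 0 well-defined
}

int main (void)
{
  printf ("0x%08x\n", rotl32 (0x80000001u, 1)); // prints 0x00000003

  return 0;
}
```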

View File

@@ -10,14 +10,14 @@
 u32 scrypt_module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1024;
+  const u32 kernel_loops_min = 2048;
 
   return kernel_loops_min;
 }
 
 u32 scrypt_module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1024;
+  const u32 kernel_loops_max = 2048;
 
   return kernel_loops_max;
 }
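
Pinning kernel_loops_min == kernel_loops_max removes the autotuner's choice: every invocation processes up to 2048 iterations (the forced -u 2048 noted in the kernel comment), so a full pass takes ceil(N / 2048) launches. A small runnable sketch of that arithmetic (the exact relationship to N is assumed from the removed 1024 checks further down):

```c
#include <stdio.h>

int main (void)
{
  const unsigned kernel_loops = 2048; // pinned: min == max

  for (unsigned scrypt_N = 1024; scrypt_N <= 262144; scrypt_N <<= 4)
  {
    // kernel invocations per smix pass at tmto 0
    printf ("N=%6u -> %u invocation(s)\n", scrypt_N,
            (scrypt_N + kernel_loops - 1) / kernel_loops);
  }

  return 0;
}
```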
@@ -84,7 +84,8 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   }
   else
   {
-    // find a nice kernel_accel programmatically
+    // find a nice kernel_accel for gpus programmatically
+    // on cpus there's no need for over subscription with scrypt
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
@@ -92,45 +93,51 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
       {
         const float multi = (float) available_mem / size_per_accel;
 
-        int accel_multi;
-
-        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
+        if ((multi * 2) >= device_processors)
         {
-          kernel_accel_new = multi * (1 << accel_multi);
+          kernel_accel_new  = multi * 2;
+          kernel_accel_new -= 2;
 
-          if (kernel_accel_new >= device_processors) break;
+          if ((multi * 4) >= device_processors * 2)
+          {
+            kernel_accel_new  = multi * 4;
+            kernel_accel_new -= 4;
+          }
         }
+        else if ((multi * 4) >= device_processors)
+        {
+          kernel_accel_new  = multi * 4;
+          kernel_accel_new -= 4;
+        }
 
-        // we need some space for tmps[], ...
-        kernel_accel_new -= (1 << accel_multi);
-
         // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
-        if ((kernel_accel_new > device_processors) && ((device_processors * 1.16) > kernel_accel_new))
+        if (kernel_accel_new > device_processors)
         {
-          kernel_accel_new = device_processors;
+          const u32 extra = kernel_accel_new % device_processors;
+
+          if (extra < (device_processors * 0.16))
+          {
+            kernel_accel_new -= extra;
+          }
         }
       }
       else
       {
-        for (int i = 1; i <= 8; i++)
+        u64 multi = available_mem / size_per_accel;
+
+        if (tmto == 0)
         {
-          if ((size_per_accel * device_processors * i) < available_mem)
-          {
-            kernel_accel_new = device_processors * i;
-          }
-        }
-      }
-    }
-    else
-    {
-      for (int i = 1; i <= 8; i++)
-      {
-        if ((size_per_accel * device_processors * i) < available_mem)
-        {
-          kernel_accel_new = device_processors * i;
+          tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
+
+          multi = available_mem / (size_per_accel >> tmto);
         }
+
+        multi /= device_processors;
+        multi -= 4;
+        multi = MIN (16, multi);
+
+        kernel_accel_new = device_processors * multi; // should be safe because of tmto
       }
     }
   }
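
Read as a whole, the new GPU logic replaces the old "keep doubling until we beat the processor count" loop with explicit 2x/4x over-subscription tiers plus a snap-to-multiple clamp. A compilable paraphrase with simplified types (pick_accel is a name invented here, not hashcat's):

```c
#include <stdint.h>
#include <stdio.h>

// multi = how many full scratchpads fit in available memory.
static uint32_t pick_accel (const float multi, const uint32_t device_processors)
{
  uint32_t kernel_accel = 0;

  if ((multi * 2) >= device_processors)
  {
    kernel_accel = (uint32_t) (multi * 2) - 2; // 2x tier, minus headroom for tmps[]

    if ((multi * 4) >= device_processors * 2)
    {
      kernel_accel = (uint32_t) (multi * 4) - 4; // 4x tier if memory covers 2x the processors
    }
  }
  else if ((multi * 4) >= device_processors)
  {
    kernel_accel = (uint32_t) (multi * 4) - 4;
  }

  if (kernel_accel > device_processors)
  {
    // snap down to a multiple of the processor count if within ~16%
    const uint32_t extra = kernel_accel % device_processors;

    if (extra < (uint32_t) (device_processors * 0.16f)) kernel_accel -= extra;
  }

  return kernel_accel;
}

int main (void)
{
  printf ("%u\n", pick_accel (256.0f, 128)); // prints 1020

  return 0;
}
```

With 256 scratchpads fitting and 128 processors, the 4x tier yields 1020; the snap leaves it untouched because 1020 % 128 = 124, well above the ~16% window of about 20.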
@@ -144,9 +151,6 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
   {
-    // we have 1024 hard-coded in the kernel
-
-    if ((scrypt_N / (1 << tmto_new)) < 1024) continue;
-
     // global memory check
 
     if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
@@ -187,14 +191,6 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
 
   u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  // We must maintain at least 1024 iteration it's hard-coded in the kernel
-
-  if ((scrypt_N / (1 << tmto)) < 1024)
-  {
-    fprintf (stderr, "ERROR: SCRYPT-N parameter too low. Invalid tmto specified?\n");
-
-    return -1;
-  }
-
   return size_scrypt / (1 << tmto);
 }
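
With the 1024-iteration floor gone, the buffer request reduces cleanly to a shift by the trade-off. A worked instance with assumed values (the 1020 carries over from the GPU sketch above):

```c
#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  const uint64_t size_per_accel   = 128ULL * 1 * 1024; // 128 * r * N for r = 1, N = 1024
  const uint64_t kernel_accel_max = 1020;              // assumed, from the sketch above
  const uint64_t tmto             = 2;

  const uint64_t size_scrypt = size_per_accel * kernel_accel_max;

  printf ("%" PRIu64 " bytes\n", size_scrypt >> tmto); // 33423360 (~31.9 MiB)

  return 0;
}
```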

View File

@@ -17,8 +17,8 @@ sub module_generate_hash
 {
   my $word = shift;
   my $salt = shift;
-  my $scrypt_N = shift || 1024; # 262144 originally
-  my $scrypt_r = shift || 1;    # 8 originally
+  my $scrypt_N = shift || 262144;
+  my $scrypt_r = shift || 8;
   my $scrypt_p = shift || 1;
   my $ciphertext = shift || random_bytes (32);
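
The test module now defaults to the real-world parameters of hash-mode 15700 (Ethereum wallet, scrypt): N=262144, r=8, p=1. Those defaults imply a substantial per-hash scratchpad, which is what makes the tuning logic above matter; a quick runnable check:

```c
#include <stdio.h>

int main (void)
{
  const unsigned long long bytes = 128ULL * 8 * 262144; // 128 * r * N

  printf ("%llu MiB per hash\n", bytes / (1024 * 1024)); // prints: 256 MiB per hash

  return 0;
}
```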