From e134564a731885ad8e8b56c87ca61c2b53961359 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Sun, 15 Jun 2025 21:14:40 +0200
Subject: [PATCH] Increase default iteration count per kernel invocation from
 1024 to 2048 Add support for lower iteration counts per kernel invocation
 than the default, enabling TMTO for low scrypt configurations, such as N=1024
 Use TMTO 2 if it reaches 4 times the device processor count, instead of TMTO
 1 always Improve performance for low scrypt configurations (hash-mode 9300)
 Fix unit test for 15700 with correct scrypt configurations Disable CPU over
 subscription for SCRYPT based algorithms

---
 OpenCL/inc_hash_scrypt.cl    |  6 ++-
 OpenCL/inc_hash_scrypt.h     | 14 ++++++-
 src/modules/scrypt_common.c  | 76 +++++++++++++++++-------------------
 tools/test_modules/m15700.pm |  4 +-
 4 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/OpenCL/inc_hash_scrypt.cl b/OpenCL/inc_hash_scrypt.cl
index 6c59404a0..9662e330c 100644
--- a/OpenCL/inc_hash_scrypt.cl
+++ b/OpenCL/inc_hash_scrypt.cl
@@ -345,9 +345,11 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
     case 3: V = V3; break;
   }
 
-  // note: fixed 1024 iterations = forced -u 1024
+  // note: max 2048 iterations = forced -u 2048
 
-  for (u32 N_pos = 0; N_pos < 1024; N_pos++)
+  const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
+
+  for (u32 N_pos = 0; N_pos < N_max; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
 
diff --git a/OpenCL/inc_hash_scrypt.h b/OpenCL/inc_hash_scrypt.h
index 3ae019a6e..a41b09def 100644
--- a/OpenCL/inc_hash_scrypt.h
+++ b/OpenCL/inc_hash_scrypt.h
@@ -26,8 +26,20 @@ inline __device__ uint4 operator +  (const uint4  a, const uint4 b) { return mak
 inline __device__ uint4 operator ^  (const uint4  a, const uint4 b) { return make_uint4 ((a.x ^  b.x), (a.y ^  b.y), (a.z ^  b.z), (a.w ^  b.w));  }
 inline __device__ uint4 operator |  (const uint4  a, const uint4 b) { return make_uint4 ((a.x |  b.x), (a.y |  b.y), (a.z |  b.z), (a.w |  b.w));  }
 inline __device__ void  operator ^= (      uint4 &a, const uint4 b) {                     a.x ^= b.x;   a.y ^= b.y;   a.z ^= b.z;   a.w ^= b.w;    }
+#endif
 
-inline __device__ uint4 rotate (const uint4 a, const int n) { return ((a << n) | ((a >> (32 - n)))); }
+#if defined IS_CUDA || defined IS_HIP
+inline __device__ uint4 rotate (const uint4 a, const int n)
+{
+  uint4 r;
+
+  r.x = hc_rotl32_S (r.x, n);
+  r.y = hc_rotl32_S (r.y, n);
+  r.z = hc_rotl32_S (r.z, n);
+  r.w = hc_rotl32_S (r.w, n);
+
+  return r;
+}
 #endif
 
 #endif
diff --git a/src/modules/scrypt_common.c b/src/modules/scrypt_common.c
index baf811d46..059f2fca8 100644
--- a/src/modules/scrypt_common.c
+++ b/src/modules/scrypt_common.c
@@ -10,14 +10,14 @@
 
 u32 scrypt_module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1024;
+  const u32 kernel_loops_min = 2048;
 
   return kernel_loops_min;
 }
 
 u32 scrypt_module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1024;
+  const u32 kernel_loops_max = 2048;
 
   return kernel_loops_max;
 }
@@ -84,7 +84,8 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   }
   else
   {
-    // find a nice kernel_accel programmatically
+    // find a nice kernel_accel for gpus programmatically
+    // on cpus there's no need for over subscription with scrypt
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
@@ -92,45 +93,51 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
       {
         const float multi = (float) available_mem / size_per_accel;
 
-        int accel_multi;
-
-        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
+        if ((multi * 2) >= device_processors)
         {
-          kernel_accel_new = multi * (1 << accel_multi);
+          kernel_accel_new  = multi * 2;
+          kernel_accel_new -= 2;
 
-          if (kernel_accel_new >= device_processors) break;
+          if ((multi * 4) >= device_processors * 2)
+          {
+            kernel_accel_new  = multi * 4;
+            kernel_accel_new -= 4;
+          }
+        }
+        else if ((multi * 4) >= device_processors)
+        {
+          kernel_accel_new  = multi * 4;
+          kernel_accel_new -= 4;
         }
-
-        // we need some space for tmps[], ...
-
-        kernel_accel_new -= (1 << accel_multi);
 
         // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
 
-        if ((kernel_accel_new > device_processors) && ((device_processors * 1.16) > kernel_accel_new))
+        if (kernel_accel_new > device_processors)
         {
-          kernel_accel_new = device_processors;
+          const u32 extra = kernel_accel_new % device_processors;
+
+          if (extra < (device_processors * 0.16))
+          {
+            kernel_accel_new -= extra;
+          }
         }
       }
       else
       {
-        for (int i = 1; i <= 8; i++)
+        u64 multi = available_mem / size_per_accel;
+
+        if (tmto == 0)
         {
-          if ((size_per_accel * device_processors * i) < available_mem)
-          {
-            kernel_accel_new = device_processors * i;
-          }
-        }
-      }
-    }
-    else
-    {
-      for (int i = 1; i <= 8; i++)
-      {
-        if ((size_per_accel * device_processors * i) < available_mem)
-        {
-          kernel_accel_new = device_processors * i;
+          tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
+
+          multi = available_mem / (size_per_accel >> tmto);
         }
+
+        multi /= device_processors;
+        multi -= 4;
+        multi  = MIN (16, multi);
+
+        kernel_accel_new = device_processors * multi; // should be safe because of tmto
       }
     }
   }
@@ -144,9 +151,6 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
 
     for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
     {
-      // we have 1024 hard-coded in the kernel
-      if ((scrypt_N / (1 << tmto_new)) < 1024) continue;
-
       // global memory check
       if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
 
@@ -187,14 +191,6 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
 
   u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  // We must maintain at least 1024 iteration it's hard-coded in the kernel
-  if ((scrypt_N / (1 << tmto)) < 1024)
-  {
-    fprintf (stderr, "ERROR: SCRYPT-N parameter too low. Invalid tmto specified?\n");
-
-    return -1;
-  }
-
   return size_scrypt / (1 << tmto);
 }
 
diff --git a/tools/test_modules/m15700.pm b/tools/test_modules/m15700.pm
index 896ce66e5..abee738bf 100644
--- a/tools/test_modules/m15700.pm
+++ b/tools/test_modules/m15700.pm
@@ -17,8 +17,8 @@ sub module_generate_hash
 {
   my $word       = shift;
   my $salt       = shift;
-  my $scrypt_N   = shift || 1024 ; # 262144 originally
-  my $scrypt_r   = shift || 1; # 8 originally
+  my $scrypt_N   = shift || 262144;
+  my $scrypt_r   = shift || 8;
   my $scrypt_p   = shift || 1;
   my $ciphertext = shift || random_bytes (32);