
Add support for detecting unified GPU memory on CUDA and HIP (previously available only for OpenCL and Metal).

Do not adjust kernel-accel or scrypt-tmto for GPUs with unified memory, typically integrated GPUs in CPUs (APUs).
Redesign the "4-buffer" strategy to avoid overallocation from naive division by four, which can significantly increase memory usage for high scrypt configurations (e.g., 256k:8:1).
Update the scrypt B[] access pattern to match the new "4-buffer" design.
Allow user-specified kernel-accel and scrypt-tmto values, individually or both, via command line and tuning database. Any unspecified parameters are adjusted automatically.
Permit user-defined combinations of scrypt-tmto and kernel-accel even if they may exceed available memory.
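For context, a minimal standalone sketch of the unified-memory query that the diff below performs through hashcat's hc_cuDeviceGetAttribute wrapper; the attribute names (CU_DEVICE_ATTRIBUTE_INTEGRATED here, hipDeviceAttributeIntegrated via hipDeviceGetAttribute on the HIP side) come from the diff, while the standalone driver-API program around them is only illustrative. On the command line, the user overrides mentioned above map to -n/--kernel-accel and --scrypt-tmto.

#include <stdio.h>
#include <cuda.h>

int main (void)
{
  if (cuInit (0) != CUDA_SUCCESS) return 1;

  int device_count = 0;

  cuDeviceGetCount (&device_count);

  for (int i = 0; i < device_count; i++)
  {
    CUdevice dev;

    cuDeviceGet (&dev, i);

    int integrated = 0; // 1 on APUs / iGPUs that share physical memory with the host

    cuDeviceGetAttribute (&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);

    printf ("device %d: unified (host-shared) memory: %s\n", i, integrated ? "yes" : "no");
  }

  return 0;
}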
Jens Steube 2025-06-17 13:32:57 +02:00
parent 1ac14903d0
commit 4b93a6e93c
6 changed files with 262 additions and 128 deletions

View File

@@ -3,7 +3,7 @@ for (int digest_pos = 0; digest_pos < DIGESTS_CNT; digest_pos++)
{
const u32 final_hash_pos = DIGESTS_OFFSET_HOST + digest_pos;
const digest_t *digest = digests_buf + final_hash_pos;
GLOBAL_AS const digest_t *digest = digests_buf + final_hash_pos;
const int invalid_bits = count_bits_32 (digest->digest_buf[0], r0)
+ count_bits_32 (digest->digest_buf[1], r1)

View File

@@ -294,10 +294,12 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
const u32 zSIZE = STATE_CNT4;
const u32 x = (u32) gid;
const u64 bid = get_group_id(0);
const u64 lsz = get_local_size(0);
const u64 lid = get_local_id(0);
const u32 xd4 = x / 4;
const u32 xm4 = x & 3;
const u32 xd4 = bid / 4;
const u32 xm4 = bid & 3;
GLOBAL_AS uint4 *V;
@@ -311,7 +313,7 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
for (u32 z = 0; z < zSIZE; z++) V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)] = X[z];
#ifdef IS_HIP
for (u32 i = 0; i < (1 << SCRYPT_TMTO); i++) salsa_r_l ((LOCAL_AS u32 *) X);
@@ -330,10 +332,12 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
const u32 zSIZE = STATE_CNT4;
const u32 x = (u32) gid;
const u64 bid = get_group_id(0);
const u64 lsz = get_local_size(0);
const u64 lid = get_local_id(0);
const u32 xd4 = x / 4;
const u32 xm4 = x & 3;
const u32 xd4 = bid / 4;
const u32 xm4 = bid & 3;
GLOBAL_AS uint4 *V;
@@ -347,7 +351,7 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
// note: max 2048 iterations = forced -u 2048
const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
const u32 N_max = (SCRYPT_N < 2048) ? SCRYPT_N : 2048;
for (u32 N_pos = 0; N_pos < N_max; N_pos++)
{
@@ -357,7 +361,7 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
const u32 km = k - (y << SCRYPT_TMTO);
for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];
for (u32 z = 0; z < zSIZE; z++) T[z] = V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)];
#ifdef IS_HIP
for (u32 i = 0; i < km; i++) salsa_r_l ((LOCAL_AS u32 *) T);

View File

@@ -15,8 +15,7 @@
#define STATE_CNT GET_STATE_CNT (SCRYPT_R)
#define STATE_CNT4 (STATE_CNT / 4)
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))
#if defined IS_CUDA
inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); }
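To make the new indexing concrete: each work-group slot (bid / 4) inside its sub-buffer owns a contiguous slab of lsz * ySIZE * zSIZE elements, and within that slab every lane (lid) gets its own ySIZE x zSIZE region, addressed row-major over (y, z). A toy, self-contained illustration with made-up sizes (the VIDX macro is copied from the diff; nothing else here is hashcat code):

#include <stdio.h>

#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))

int main (void)
{
  const unsigned lsz = 4, ySIZE = 2, zSIZE = 3; // toy sizes only

  // each lane prints a disjoint, contiguous run of indices inside the slab of work-group slot 0
  for (unsigned lid = 0; lid < lsz; lid++)
    for (unsigned y = 0; y < ySIZE; y++)
      for (unsigned z = 0; z < zSIZE; z++)
        printf ("lid=%u y=%u z=%u -> %u\n", lid, y, z, VIDX (0, lsz, lid, ySIZE, zSIZE, y, z));

  return 0;
}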

View File

@@ -104,6 +104,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
return atomicOr (p, val);
}
DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
{
return blockIdx.x;
}
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
{
return (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -280,6 +285,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
return atomicOr (p, val);
}
DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
{
return blockIdx.x;
}
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
{
return (blockIdx.x * blockDim.x) + threadIdx.x;

View File

@@ -5305,6 +5305,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
hc_string_trim_trailing (device_name);
// unified memory
int device_host_unified_memory = 0;
if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_device) == -1)
{
device_param->skipped = true;
continue;
}
device_param->device_host_unified_memory = device_host_unified_memory;
// device_processors
int device_processors = 0;
@@ -5735,6 +5748,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
hc_string_trim_trailing (device_name);
// unified memory
int device_host_unified_memory = 0;
if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, hipDeviceAttributeIntegrated, hip_device) == -1)
{
device_param->skipped = true;
continue;
}
device_param->device_host_unified_memory = device_host_unified_memory;
// device_processors
int device_processors = 0;
@@ -9705,11 +9731,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
u32 _kernel_accel = 0;
tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
if (user_options->kernel_accel_chgd == true)
{
_kernel_accel = user_options->kernel_accel;
}
else
{
tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
if (user_options->kernel_accel_chgd == true) _kernel_accel = user_options->kernel_accel;
if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
}
const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes, device_id, _kernel_accel);
@@ -10405,7 +10436,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
// extra buffer
u64 size_extra_buffer = 4;
u64 size_extra_buffer1 = 4;
u64 size_extra_buffer2 = 4;
u64 size_extra_buffer3 = 4;
u64 size_extra_buffer4 = 4;
if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT)
{
@@ -10423,20 +10457,51 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
device_param->extra_buffer_size = extra_buffer_size;
// for the size we actually allocate we need to cheat a bit in order to make it more easy for plugin developer.
//
// we will divide this size by 4 to workaround opencl limitation.
// this collides with a theoretical scenario (like -n1 -T1) where there's only one workitem,
// because inside the kernel the target buffer is selected by workitem_id / 4.
// but the maximum size of the buffer would be only 1/4 of what is needed -> overflow.
//
// to workaround this we make sure that there's always a full buffer in each of the 4 allocated buffers available.
/**
* We use a "4-buffer" strategy for certain hash types (like scrypt)
* that require large scratch buffers per work-item.
*
* The kernel assigns each work-item to one of 4 sub-buffers using:
* buffer index = workitem_id % 4
*
* This means that each of the 4 sub-buffers must be large enough to hold
* all work-items that map to it. However, the total number of work-items
* is not always a multiple of 4. If we naively split the total buffer size
* evenly into 4 parts, the last chunk may be too small and cause buffer
* overflows for configurations where work-items spill into a partially sized chunk.
*
* Previous versions worked around this by over-allocating a full extra buffer,
* but this wasted gpu memory for large hashes like scrypt with high N.
*
* This improved logic computes the exact number of work-items assigned to
* each of the 4 chunks and sizes each chunk precisely:
*
* - The first 'leftover' chunks get one extra work-item to cover any remainder.
* - This guarantees each chunk is large enough for its assigned work-items.
*/
const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_accel_max;
const u64 extra_buffer_size_one = extra_buffer_size / kernel_power_max;
const u64 extra_buffer_size_threads = extra_buffer_size / kernel_power_max;
size_extra_buffer = extra_buffer_size + (extra_buffer_size_one * 4);
const u64 workitems_per_chunk = kernel_power_max / 4;
const u64 base_chunk_size = workitems_per_chunk * extra_buffer_size_threads;
size_extra_buffer1 = base_chunk_size;
size_extra_buffer2 = base_chunk_size;
size_extra_buffer3 = base_chunk_size;
size_extra_buffer4 = base_chunk_size;
const u64 leftover = kernel_power_max % 4;
switch (leftover)
{
case 3: size_extra_buffer3 += extra_buffer_size_threads; // fall-through
case 2: size_extra_buffer2 += extra_buffer_size_threads; // fall-through
case 1: size_extra_buffer1 += extra_buffer_size_threads; // fall-through
case 0: break;
}
}
// kern type
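To make the chunk sizing above concrete, a small self-contained example with hypothetical numbers (kernel_power_max = 10 work-items and 1 MiB of extra buffer per work-item; the variable names match the diff, the values do not come from it): the four buffers receive 3, 3, 2 and 2 work-items instead of a naive 10 / 4 split, so no buffer can end up smaller than the work-items mapped onto it.

#include <stdio.h>
#include <inttypes.h>

int main (void)
{
  const uint64_t kernel_power_max          = 10;          // hypothetical total work-items
  const uint64_t extra_buffer_size_threads = 1024 * 1024; // hypothetical bytes per work-item

  const uint64_t workitems_per_chunk = kernel_power_max / 4; // 2
  const uint64_t leftover            = kernel_power_max % 4; // 2

  for (uint64_t i = 0; i < 4; i++)
  {
    // the first 'leftover' buffers take one extra work-item, matching the switch fall-through above
    const uint64_t items = workitems_per_chunk + ((i < leftover) ? 1 : 0);
    const uint64_t bytes = items * extra_buffer_size_threads;

    printf ("extra buffer %" PRIu64 ": %" PRIu64 " work-items, %" PRIu64 " bytes\n", i + 1, items, bytes);
  }

  return 0;
}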
@@ -11343,7 +11408,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
+ size_shown
+ size_salts
+ size_results
+ size_extra_buffer
+ size_extra_buffer1
+ size_extra_buffer2
+ size_extra_buffer3
+ size_extra_buffer4
+ size_st_digests
+ size_st_salts
+ size_st_esalts
@@ -11380,10 +11448,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_shown, size_shown) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_salt_bufs, size_salts) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_result, size_results) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer1) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer2) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer3) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_digests_buf, size_st_digests) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf, size_st_salts) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11491,10 +11559,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_digests_shown, size_shown) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_salt_bufs, size_salts) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_result, size_results) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer1) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer2) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer3) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_digests_buf, size_st_digests) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_salts_buf, size_st_salts) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11614,10 +11682,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
// shared
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_results, NULL, &device_param->metal_d_result) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer1, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer2, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer3, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
// gpu only
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_st_digests, NULL, &device_param->metal_d_st_digests_buf) == -1) return -1;
@@ -11712,10 +11780,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_shown, NULL, &device_param->opencl_d_digests_shown) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_salts, NULL, &device_param->opencl_d_salt_bufs) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_results, NULL, &device_param->opencl_d_result) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer1, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer2, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer3, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_digests, NULL, &device_param->opencl_d_st_digests_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_salts, NULL, &device_param->opencl_d_st_salts_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_kernel_params, NULL, &device_param->opencl_d_kernel_param) == -1) return -1;
@@ -15734,7 +15802,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (size_rules > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_rules_c > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_salts > undocumented_single_allocation_apple) memory_limit_hit = 1;
if ((size_extra_buffer / 4) > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer1 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer2 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer3 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer4 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_shown > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_tm > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_tmps > undocumented_single_allocation_apple) memory_limit_hit = 1;
@@ -15769,7 +15840,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
+ size_rules
+ size_rules_c
+ size_salts
+ size_extra_buffer
+ size_extra_buffer1
+ size_extra_buffer2
+ size_extra_buffer3
+ size_extra_buffer4
+ size_shown
+ size_tm
+ size_tmps

View File

@@ -46,22 +46,17 @@ u32 scrypt_exptected_threads (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
return threads;
}
const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
// preprocess tmto in case user has overridden
// it's important to set to 0 otherwise so we can postprocess tmto in that case
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so the next lines should be fine
const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N == 0) ? hashes->st_salts_buf[0].scrypt_N : hashes->salts_buf[0].scrypt_N;
const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r == 0) ? hashes->st_salts_buf[0].scrypt_r : hashes->salts_buf[0].scrypt_r;
const u32 scrypt_p = (hashes->salts_buf[0].scrypt_p == 0) ? hashes->st_salts_buf[0].scrypt_p : hashes->salts_buf[0].scrypt_p;
const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param)) >> tmto;
const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
const u64 state_per_accel = (128ULL * scrypt_r * scrypt_p * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
int lines_sz = 4096;
@@ -72,95 +67,144 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
const u32 device_local_mem_size = device_param->device_local_mem_size;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
const u64 fixed_mem = (512 * 1024 * 1024); // some storage we need for pws[], tmps[], and others
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - fixed_mem;
tmto = 0;
u32 kernel_accel_new = device_processors;
if (kernel_accel)
if (kernel_accel_user)
{
// from command line or tuning db has priority
kernel_accel_new = kernel_accel_user;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel for gpus programmatically
// on cpus there's no need for over subscription with scrypt
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
if (user_options->scrypt_tmto_chgd == true)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
// in this branch the user can shoot themselves in the foot
tmto = user_options->scrypt_tmto;
}
else
{
// only option to save the user is to increase tmto
for (tmto = 0; tmto < 6; tmto++)
{
const float multi = (float) available_mem / size_per_accel;
const u64 size_per_accel_tmto = size_per_accel >> tmto;
if ((multi * 2) >= device_processors)
{
kernel_accel_new = multi * 2;
kernel_accel_new -= 2;
if ((size_per_accel_tmto * kernel_accel_new) > available_mem) continue; // not enough memory
if ((multi * 4) >= device_processors * 2)
{
kernel_accel_new = multi * 4;
kernel_accel_new -= 4;
}
}
else if ((multi * 4) >= device_processors)
{
kernel_accel_new = multi * 4;
kernel_accel_new -= 4;
}
// clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
if (kernel_accel_new > device_processors)
{
const u32 extra = kernel_accel_new % device_processors;
if (extra < (device_processors * 0.16))
{
kernel_accel_new -= extra;
}
}
}
else
{
u64 multi = available_mem / size_per_accel;
if (tmto == 0)
{
tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
multi = available_mem / (size_per_accel >> tmto);
}
multi /= device_processors;
multi -= 4;
multi = MIN (16, multi);
kernel_accel_new = device_processors * multi; // should be safe because of tmto
break;
}
}
}
// fix tmto if user allows
if (tmto == 0)
else
{
const u32 tmto_start = 0;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
if (user_options->scrypt_tmto_chgd == true)
{
// global memory check
if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
tmto = user_options->scrypt_tmto;
}
else
{
// This is the typical case and the main challenge: choosing the right TMTO value.
// Finding a consistently good algorithm is nearly impossible due to the many factors
// that influence performance. There is no clear rule of thumb.
//
// For example, consider the default scrypt configuration with N=16k and r=8.
//
// In one test with an NVIDIA mobile GPU with 16 GiB of memory (minus X), the device could
// use 28/58 processors. In theory, increasing the TMTO should increase
// performance, but in practice it had no effect at all.
//
// In another test with an NVIDIA discrete GPU with 11 GiB (minus X), the device initially
// used 19/68 processors. Increasing the TMTO to utilize all 68 processors
// did yield the expected performance improvement, matching the theory.
//
// However, with an AMD discrete GPU with 24 GiB (minus X), the optimal case used 46/48
// processors. Increasing the TMTO should have reduced performance, but
// instead it nearly doubled the speed?! This might be related to AMD GPUs performing
// best with a thread count of 64 instead of 32, but in practice, using 64 threads
// shows little difference compared to 32, suggesting that at a very low level,
// only 32 threads may actually be active.
//
// This algorithm is far from ideal. Fortunately, we have a tuning database,
// so users can find the best -n value for their specific setup, and a forced -n value
// makes it easy to calculate the TMTO.
// also need local memory check because in kernel we have:
// LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; // 32 * 128 * r * p = 32KiB we're close if there's no TMTO
if (device_local_mem_size < (state_per_accel >> tmto_new)) continue;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
for (tmto = 0; tmto < 2; tmto++) // results in tmto = 2
{
if (device_param->device_host_unified_memory == 1) break; // do not touch
tmto = tmto_new;
if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
|| (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP))
{
if (tmto == 0) continue; // at least 1
}
break;
const u64 size_per_accel_tmto = size_per_accel >> tmto;
const float blocks = (float) available_mem / size_per_accel_tmto;
const float blocks_perc = device_processors / blocks;
if (blocks_perc > 1.16) continue;
// probably very low scrypt configuration = register pressure becomes a bottleneck
if ((blocks_perc * (1 << tmto)) < 0.4) continue;
break;
}
if (device_param->is_hip == true)
{
// we use some local memory to speed up things, so
// we need to make sure there's enough local memory available
u64 state_per_accel_tmto = state_per_accel >> tmto;
while (state_per_accel_tmto > device_local_mem_size)
{
tmto++;
state_per_accel_tmto = state_per_accel >> tmto;
}
}
}
}
// from here tmto is known, and we need to update kernel_accel
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false))
{
const u64 size_per_accel_tmto = size_per_accel >> tmto;
kernel_accel_new = available_mem / size_per_accel_tmto;
kernel_accel_new = MIN (kernel_accel_new, 1024); // max supported
// luxury option, clamp if we have twice the processors
if (kernel_accel_new > (device_processors * 2))
{
const u32 extra = kernel_accel_new % device_processors;
kernel_accel_new -= extra;
}
// clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
if (kernel_accel_new > device_processors)
{
const u32 extra = kernel_accel_new % device_processors;
if (extra < (device_processors * 0.16))
{
kernel_accel_new -= extra;
}
}
}
}
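A compact, self-contained sketch of how the branch above (GPU, no unified memory, no user-forced tmto) picks the TMTO and then derives kernel_accel, with hypothetical numbers (scrypt N=16k, r=8, 32 threads, 48 processors, roughly 7.5 GiB of usable memory); it reuses the variable names from the diff but leaves out the AMD "at least tmto 1" rule, the HIP local-memory adjustment, and the final clamping steps:

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  const uint64_t size_per_accel    = 128ULL * 8 * 16384 * 32; // 512 MiB: hypothetical N=16k, r=8, 32 threads
  const uint64_t available_mem     = 7680ULL * 1024 * 1024;   // hypothetical, after the fixed 512 MiB reservation
  const uint32_t device_processors = 48;                      // hypothetical

  uint32_t tmto;

  for (tmto = 0; tmto < 2; tmto++) // same bound as the diff: falls through to tmto = 2 if no fit is found
  {
    const uint64_t size_per_accel_tmto = size_per_accel >> tmto;

    const float blocks      = (float) available_mem / size_per_accel_tmto;
    const float blocks_perc = device_processors / blocks;

    if (blocks_perc > 1.16f) continue;                // not enough blocks fit yet: raise tmto
    if ((blocks_perc * (1 << tmto)) < 0.4f) continue; // very low scrypt config: register pressure dominates

    break;
  }

  const uint64_t kernel_accel = available_mem / (size_per_accel >> tmto);

  printf ("tmto=%u kernel_accel=%llu\n", tmto, (unsigned long long) kernel_accel); // here: tmto=2, kernel_accel=60

  return 0;
}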
@@ -189,9 +233,11 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
const u64 size_per_accel = 128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param);
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
const u64 size_per_accel_tmto = size_per_accel >> tmto;
return size_scrypt / (1 << tmto);
const u64 size_scrypt = device_param->kernel_accel_max * size_per_accel_tmto;
return size_scrypt;
}
u64 scrypt_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
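Worked example of the size returned above, with the same hypothetical parameters as the earlier sketch (N=16384, r=8, 32 threads, tmto=2, kernel_accel_max=60); the formula is the one from the diff, the numbers are not:

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  const uint64_t scrypt_N = 16384, scrypt_r = 8, threads = 32; // hypothetical
  const uint64_t tmto = 2, kernel_accel_max = 60;              // hypothetical

  const uint64_t size_per_accel      = 128ULL * scrypt_r * scrypt_N * threads; // 512 MiB
  const uint64_t size_per_accel_tmto = size_per_accel >> tmto;                 // 128 MiB
  const uint64_t size_scrypt         = kernel_accel_max * size_per_accel_tmto; // 7.5 GiB

  printf ("%llu bytes (%.1f GiB)\n", (unsigned long long) size_scrypt, (double) size_scrypt / (1024.0 * 1024.0 * 1024.0));

  return 0;
}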
@@ -256,3 +302,4 @@ char *scrypt_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconf
return jit_build_options;
}