diff --git a/OpenCL/inc_comp_multi_iba.cl b/OpenCL/inc_comp_multi_iba.cl
index 9a41cc17c..05d8bf3b1 100644
--- a/OpenCL/inc_comp_multi_iba.cl
+++ b/OpenCL/inc_comp_multi_iba.cl
@@ -3,7 +3,7 @@ for (int digest_pos = 0; digest_pos < DIGESTS_CNT; digest_pos++)
 {
   const u32 final_hash_pos = DIGESTS_OFFSET_HOST + digest_pos;
 
-  const digest_t *digest = digests_buf + final_hash_pos;
+  GLOBAL_AS const digest_t *digest = digests_buf + final_hash_pos;
 
   const int invalid_bits = count_bits_32 (digest->digest_buf[0], r0)
                          + count_bits_32 (digest->digest_buf[1], r1)
diff --git a/OpenCL/inc_hash_scrypt.cl b/OpenCL/inc_hash_scrypt.cl
index 9662e330c..6655e2191 100644
--- a/OpenCL/inc_hash_scrypt.cl
+++ b/OpenCL/inc_hash_scrypt.cl
@@ -294,10 +294,12 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
   const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
   const u32 zSIZE = STATE_CNT4;
 
-  const u32 x = (u32) gid;
+  const u64 bid = get_group_id(0);
+  const u64 lsz = get_local_size(0);
+  const u64 lid = get_local_id(0);
 
-  const u32 xd4 = x / 4;
-  const u32 xm4 = x & 3;
+  const u32 xd4 = bid / 4;
+  const u32 xm4 = bid & 3;
 
   GLOBAL_AS uint4 *V;
 
@@ -311,7 +313,7 @@
   for (u32 y = 0; y < ySIZE; y++)
   {
-    for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
+    for (u32 z = 0; z < zSIZE; z++) V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)] = X[z];
 
     #ifdef IS_HIP
     for (u32 i = 0; i < (1 << SCRYPT_TMTO); i++) salsa_r_l ((LOCAL_AS u32 *) X);
@@ -330,10 +332,12 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
   const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
   const u32 zSIZE = STATE_CNT4;
 
-  const u32 x = (u32) gid;
+  const u64 bid = get_group_id(0);
+  const u64 lsz = get_local_size(0);
+  const u64 lid = get_local_id(0);
 
-  const u32 xd4 = x / 4;
-  const u32 xm4 = x & 3;
+  const u32 xd4 = bid / 4;
+  const u32 xm4 = bid & 3;
 
   GLOBAL_AS uint4 *V;
 
@@ -347,7 +351,7 @@
 
   // note: max 2048 iterations = forced -u 2048
 
-  const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
+  const u32 N_max = (SCRYPT_N < 2048) ? SCRYPT_N : 2048;
 
   for (u32 N_pos = 0; N_pos < N_max; N_pos++)
   {
@@ -357,7 +361,7 @@
     const u32 km = k - (y << SCRYPT_TMTO);
 
-    for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];
+    for (u32 z = 0; z < zSIZE; z++) T[z] = V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)];
 
     #ifdef IS_HIP
     for (u32 i = 0; i < km; i++) salsa_r_l ((LOCAL_AS u32 *) T);
diff --git a/OpenCL/inc_hash_scrypt.h b/OpenCL/inc_hash_scrypt.h
index a41b09def..6f3864cdd 100644
--- a/OpenCL/inc_hash_scrypt.h
+++ b/OpenCL/inc_hash_scrypt.h
@@ -15,8 +15,7 @@
 #define STATE_CNT GET_STATE_CNT (SCRYPT_R)
 #define STATE_CNT4 (STATE_CNT / 4)
 
-#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
-#define CO Coord(xd4,y,z)
+#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))
 
 #if defined IS_CUDA
 inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); }
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index 99d2cea13..1125236dd 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -104,6 +104,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
   return atomicOr (p, val);
 }
 
+DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
+{
+  return blockIdx.x;
+}
+
 DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
 {
   return (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -280,6 +285,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
   return atomicOr (p, val);
 }
 
+DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
+{
+  return blockIdx.x;
+}
+
 DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
 {
   return (blockIdx.x * blockDim.x) + threadIdx.x;
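Note: the old Coord/CO macro derived a work-item's slice of the scratch buffer V from its global id, which no longer works once the work-group id (rather than the global id) selects the buffer quarter. VIDX keys the slice off (group id / 4, lane id) instead, so every lane of a work-group owns a contiguous ySIZE * zSIZE region inside the group's slice. A minimal host-side C sketch of the index arithmetic; the sizes (ySIZE = 8, zSIZE = 4, lsz = 32, group 5) are made-up toy values, not hashcat's real configuration:

#include <stdio.h>

/* same formula as the new VIDX macro in inc_hash_scrypt.h */
#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))

int main (void)
{
  const unsigned ySIZE = 8;  // toy stand-in for SCRYPT_N >> SCRYPT_TMTO
  const unsigned zSIZE = 4;  // toy stand-in for STATE_CNT4
  const unsigned lsz   = 32; // work-group size
  const unsigned bid   = 5;  // some work-group id
  const unsigned xd4   = bid / 4;

  // each lane owns a contiguous ySIZE * zSIZE region within the group's slice
  for (unsigned lid = 0; lid < 4; lid++)
  {
    printf ("lid=%u -> first slot %u, last slot %u\n",
      lid,
      VIDX (xd4, lsz, lid, ySIZE, zSIZE, 0, 0),
      VIDX (xd4, lsz, lid, ySIZE, zSIZE, ySIZE - 1, zSIZE - 1));
  }

  return 0;
}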
diff --git a/src/backend.c b/src/backend.c
index 4a7e36be3..9b1f290a5 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -5305,6 +5305,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       hc_string_trim_trailing (device_name);
 
+      // unified memory
+
+      int device_host_unified_memory = 0;
+
+      if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      device_param->device_host_unified_memory = device_host_unified_memory;
+
       // device_processors
 
       int device_processors = 0;
@@ -5735,6 +5748,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       hc_string_trim_trailing (device_name);
 
+      // unified memory
+
+      int device_host_unified_memory = 0;
+
+      if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, hipDeviceAttributeIntegrated, hip_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      device_param->device_host_unified_memory = device_host_unified_memory;
+
       // device_processors
 
       int device_processors = 0;
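Note: hc_cuDeviceGetAttribute / hc_hipDeviceGetAttribute are hashcat's error-checked wrappers; the underlying query is a one-liner in the plain CUDA driver API (HIP mirrors it with hipDeviceGetAttribute / hipDeviceAttributeIntegrated). A minimal standalone sketch of the attribute this hunk records, querying device 0 without any hashcat context:

#include <stdio.h>
#include <cuda.h>

int main (void)
{
  if (cuInit (0) != CUDA_SUCCESS) return 1;

  CUdevice dev;

  if (cuDeviceGet (&dev, 0) != CUDA_SUCCESS) return 1;

  int integrated = 0;

  // 1 on integrated (host-unified-memory) devices, 0 on discrete boards
  if (cuDeviceGetAttribute (&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev) != CUDA_SUCCESS) return 1;

  printf ("host unified memory: %d\n", integrated);

  return 0;
}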
@@ -9705,11 +9731,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
       u32 _kernel_accel = 0;
 
-      tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
+      if (user_options->kernel_accel_chgd == true)
+      {
+        _kernel_accel = user_options->kernel_accel;
+      }
+      else
+      {
+        tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
 
-      if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
-
-      if (user_options->kernel_accel_chgd == true) _kernel_accel = user_options->kernel_accel;
+        if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
+      }
 
       const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes, device_id, _kernel_accel);
 
@@ -10405,7 +10436,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     // extra buffer
 
-    u64 size_extra_buffer = 4;
+    u64 size_extra_buffer1 = 4;
+    u64 size_extra_buffer2 = 4;
+    u64 size_extra_buffer3 = 4;
+    u64 size_extra_buffer4 = 4;
 
     if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT)
    {
@@ -10423,20 +10457,51 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
      device_param->extra_buffer_size = extra_buffer_size;
 
-      // for the size we actually allocate we need to cheat a bit in order to make it more easy for plugin developer.
-      //
-      // we will divide this size by 4 to workaround opencl limitation.
-      // this collides with a theoretical scenario (like -n1 -T1) where there's only one workitem,
-      // because inside the kernel the target buffer is selected by workitem_id / 4.
-      // but the maximum size of the buffer would be only 1/4 of what is needed -> overflow.
-      //
-      // to workaround this we make sure that there's always a full buffer in each of the 4 allocated buffers available.
+      /**
+       * We use a "4-buffer" strategy for certain hash types (like scrypt)
+       * that require large scratch buffers per work-item.
+       *
+       * The kernel assigns each work-item to one of 4 sub-buffers using:
+       *   buffer index = workitem_id % 4
+       *
+       * This means that each of the 4 sub-buffers must be large enough to hold
+       * all work-items that map to it. However, the total number of work-items
+       * is not always a multiple of 4. If we naively split the total buffer size
+       * evenly into 4 parts, the last chunk may be too small and cause buffer
+       * overflows for configurations where work-items spill into a partially sized chunk.
+       *
+       * Previous versions worked around this by over-allocating a full extra buffer,
+       * but this wasted GPU memory for large hashes like scrypt with high N.
+       *
+       * This improved logic computes the exact number of work-items assigned to
+       * each of the 4 chunks and sizes each chunk precisely:
+       *
+       * - The first 'leftover' chunks get one extra work-item to cover any remainder.
+       * - This guarantees each chunk is large enough for its assigned work-items.
+       */
 
-      const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+      const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_accel_max;
 
-      const u64 extra_buffer_size_one = extra_buffer_size / kernel_power_max;
+      const u64 extra_buffer_size_threads = extra_buffer_size / kernel_power_max;
 
-      size_extra_buffer = extra_buffer_size + (extra_buffer_size_one * 4);
+      const u64 workitems_per_chunk = kernel_power_max / 4;
+
+      const u64 base_chunk_size = workitems_per_chunk * extra_buffer_size_threads;
+
+      size_extra_buffer1 = base_chunk_size;
+      size_extra_buffer2 = base_chunk_size;
+      size_extra_buffer3 = base_chunk_size;
+      size_extra_buffer4 = base_chunk_size;
+
+      const u64 leftover = kernel_power_max % 4;
+
+      switch (leftover)
+      {
+        case 3: size_extra_buffer3 += extra_buffer_size_threads; // fall-through
+        case 2: size_extra_buffer2 += extra_buffer_size_threads; // fall-through
+        case 1: size_extra_buffer1 += extra_buffer_size_threads; // fall-through
+        case 0: break;
+      }
    }
 
    // kern type
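Note: the sizing arithmetic in the comment above is easy to verify with numbers. A small host-side sketch, using a hypothetical kernel_power_max of 127 (deliberately not a multiple of 4) and a made-up per-work-item size:

#include <assert.h>
#include <stdio.h>

int main (void)
{
  const unsigned long long per_workitem     = 1024; // extra_buffer_size / kernel_power_max
  const unsigned long long kernel_power_max = 127;  // hypothetical

  const unsigned long long per_chunk = kernel_power_max / 4; // 31
  const unsigned long long leftover  = kernel_power_max % 4; // 3

  unsigned long long chunk[4];

  for (int i = 0; i < 4; i++) chunk[i] = per_chunk * per_workitem;

  // the first 'leftover' chunks each take one extra work-item,
  // matching the residues 0..leftover-1 of workitem_id % 4
  for (unsigned long long i = 0; i < leftover; i++) chunk[i] += per_workitem;

  unsigned long long total = 0;

  for (int i = 0; i < 4; i++) total += chunk[i];

  // together the four chunks hold exactly kernel_power_max work-items
  assert (total == kernel_power_max * per_workitem);

  printf ("chunks: %llu %llu %llu %llu\n", chunk[0], chunk[1], chunk[2], chunk[3]);

  return 0;
}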
@@ -11343,7 +11408,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
                + size_shown
                + size_salts
                + size_results
-               + size_extra_buffer
+               + size_extra_buffer1
+               + size_extra_buffer2
+               + size_extra_buffer3
+               + size_extra_buffer4
                + size_st_digests
                + size_st_salts
                + size_st_esalts
@@ -11380,10 +11448,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_shown, size_shown) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_salt_bufs, size_salts) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_result, size_results) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer1) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer2) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer3) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer4) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_digests_buf, size_st_digests) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf, size_st_salts) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11491,10 +11559,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_digests_shown, size_shown) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_salt_bufs, size_salts) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_result, size_results) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer1) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer2) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer3) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer4) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_digests_buf, size_st_digests) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_salts_buf, size_st_salts) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11614,10 +11682,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     // shared
 
     if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_results, NULL, &device_param->metal_d_result) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer1, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer2, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer3, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
 
     // gpu only
 
     if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_st_digests, NULL, &device_param->metal_d_st_digests_buf) == -1) return -1;
@@ -11712,10 +11780,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_shown, NULL, &device_param->opencl_d_digests_shown) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_salts, NULL, &device_param->opencl_d_salt_bufs) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_results, NULL, &device_param->opencl_d_result) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer1, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer2, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer3, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_digests, NULL, &device_param->opencl_d_st_digests_buf) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_salts, NULL, &device_param->opencl_d_st_salts_buf) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_kernel_params, NULL, &device_param->opencl_d_kernel_param) == -1) return -1;
@@ -15734,7 +15802,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (size_rules > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_rules_c > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_salts > undocumented_single_allocation_apple) memory_limit_hit = 1;
-    if ((size_extra_buffer / 4) > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer1 > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer2 > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer3 > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer4 > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_shown > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_tm > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_tmps > undocumented_single_allocation_apple) memory_limit_hit = 1;
@@ -15769,7 +15840,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
                     + size_rules
                     + size_rules_c
                     + size_salts
-                    + size_extra_buffer
+                    + size_extra_buffer1
+                    + size_extra_buffer2
+                    + size_extra_buffer3
+                    + size_extra_buffer4
                     + size_shown
                     + size_tm
                     + size_tmps
diff --git a/src/modules/scrypt_common.c b/src/modules/scrypt_common.c
index 059f2fca8..971683ae5 100644
--- a/src/modules/scrypt_common.c
+++ b/src/modules/scrypt_common.c
@@ -46,22 +46,17 @@ u32 scrypt_exptected_threads (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   return threads;
 }
 
-const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
+const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
 {
   hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
 
-  // preprocess tmto in case user has overridden
-  // it's important to set to 0 otherwise so we can postprocess tmto in that case
-
-  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
-
   // we enforce the same configuration for all hashes, so the next lines should be fine
 
   const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N == 0) ? hashes->st_salts_buf[0].scrypt_N : hashes->salts_buf[0].scrypt_N;
   const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r == 0) ? hashes->st_salts_buf[0].scrypt_r : hashes->salts_buf[0].scrypt_r;
   const u32 scrypt_p = (hashes->salts_buf[0].scrypt_p == 0) ? hashes->st_salts_buf[0].scrypt_p : hashes->salts_buf[0].scrypt_p;
 
-  const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param)) >> tmto;
+  const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
 
   const u64 state_per_accel = (128ULL * scrypt_r * scrypt_p * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
 
   int lines_sz = 4096;
@@ -72,95 +67,144 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
 
   const u32 device_local_mem_size = device_param->device_local_mem_size;
 
-  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+  const u64 fixed_mem = (512 * 1024 * 1024); // some storage we need for pws[], tmps[], and others
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - fixed_mem;
+
+  tmto = 0;
 
   u32 kernel_accel_new = device_processors;
 
-  if (kernel_accel)
+  if (kernel_accel_user)
   {
-    // from command line or tuning db has priority
+    kernel_accel_new = kernel_accel_user;
 
-    kernel_accel_new = user_options->kernel_accel;
-  }
-  else
-  {
-    // find a nice kernel_accel for gpus programmatically
-    // on cpus there's no need for over subscription with scrypt
-
-    if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+    if (user_options->scrypt_tmto_chgd == true)
     {
-      if ((size_per_accel * device_processors) > available_mem) // not enough memory
+      // in this branch the user can shoot themselves in the foot
+
+      tmto = user_options->scrypt_tmto;
+    }
+    else
+    {
+      // only option to save the user is to increase tmto
+
+      for (tmto = 0; tmto < 6; tmto++)
       {
-        const float multi = (float) available_mem / size_per_accel;
+        const u64 size_per_accel_tmto = size_per_accel >> tmto;
 
-        if ((multi * 2) >= device_processors)
-        {
-          kernel_accel_new = multi * 2;
-          kernel_accel_new -= 2;
+        if ((size_per_accel_tmto * kernel_accel_new) > available_mem) continue; // not enough memory
 
-          if ((multi * 4) >= device_processors * 2)
-          {
-            kernel_accel_new = multi * 4;
-            kernel_accel_new -= 4;
-          }
-        }
-        else if ((multi * 4) >= device_processors)
-        {
-          kernel_accel_new = multi * 4;
-          kernel_accel_new -= 4;
-        }
-
-        // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
-
-        if (kernel_accel_new > device_processors)
-        {
-          const u32 extra = kernel_accel_new % device_processors;
-
-          if (extra < (device_processors * 0.16))
-          {
-            kernel_accel_new -= extra;
-          }
-        }
-      }
-      else
-      {
-        u64 multi = available_mem / size_per_accel;
-
-        if (tmto == 0)
-        {
-          tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
-
-          multi = available_mem / (size_per_accel >> tmto);
-        }
-
-        multi /= device_processors;
-        multi -= 4;
-        multi = MIN (16, multi);
-
-        kernel_accel_new = device_processors * multi; // should be safe because of tmto
+        break;
       }
     }
   }
-
-  // fix tmto if user allows
-
-  if (tmto == 0)
+  else
   {
-    const u32 tmto_start = 0;
-    const u32 tmto_stop = 5;
-
-    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    if (user_options->scrypt_tmto_chgd == true)
     {
-      // global memory check
-      if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
+      tmto = user_options->scrypt_tmto;
+    }
+    else
+    {
+      // This is the typical case and the main challenge: choosing the right TMTO value.
+      // Finding a consistently good algorithm is nearly impossible due to the many factors
+      // that influence performance. There is no clear rule of thumb.
+      //
+      // For example, consider the default scrypt configuration with N=16k and r=8.
+      //
+      // In one test with an NVIDIA mobile GPU with 16 GiB of memory (minus X), the device could
+      // use 28/58 processors. In theory, increasing the TMTO should increase
+      // performance, but in practice it had no effect at all.
+      //
+      // In another test with an NVIDIA discrete GPU with 11 GiB (minus X), the device initially
+      // used 19/68 processors. Increasing the TMTO to utilize all 68 processors
+      // did yield the expected performance improvement, matching the theory.
+      //
+      // However, with an AMD discrete GPU with 24 GiB (minus X), the optimal case used 46/48
+      // processors. Increasing the TMTO should have reduced performance, but
+      // instead it nearly doubled the speed?! This might be related to AMD GPUs performing
+      // best with a thread count of 64 instead of 32, but in practice, using 64 threads
+      // shows little difference compared to 32, suggesting that at a very low level,
+      // only 32 threads may actually be active.
+      //
+      // This algorithm is far from ideal. Fortunately, we have a tuning database,
+      // so users can find the best -n value for their specific setup, and a forced -n value
+      // makes it easy to calculate the TMTO.
 
-      // also need local memory check because in kernel we have:
-      // LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; // 32 * 128 * r * p = 32KiB we're close if there's no TMTO
-      if (device_local_mem_size < (state_per_accel >> tmto_new)) continue;
+      if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+      {
+        for (tmto = 0; tmto < 2; tmto++) // results in tmto = 2
+        {
+          if (device_param->device_host_unified_memory == 1) break; // do not touch
+
+          if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
+           || (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP))
+          {
+            if (tmto == 0) continue; // at least 1
+          }
 
-      tmto = tmto_new;
+          const u64 size_per_accel_tmto = size_per_accel >> tmto;
+
+          const float blocks = (float) available_mem / size_per_accel_tmto;
+
+          const float blocks_perc = device_processors / blocks;
+
+          if (blocks_perc > 1.16) continue;
+
+          // probably very low scrypt configuration = register pressure becomes a bottleneck
+          if ((blocks_perc * (1 << tmto)) < 0.4) continue;
 
-      break;
+          break;
+        }
+
+        if (device_param->is_hip == true)
+        {
+          // we use some local memory to speed up things, so
+          // we need to make sure there's enough local memory available
+
+          u64 state_per_accel_tmto = state_per_accel >> tmto;
+
+          while (state_per_accel_tmto > device_local_mem_size)
+          {
+            tmto++;
+
+            state_per_accel_tmto = state_per_accel >> tmto;
+          }
+        }
+      }
+    }
+
+    // from here tmto is known, and we need to update kernel_accel
+
+    if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false))
+    {
+      const u64 size_per_accel_tmto = size_per_accel >> tmto;
+
+      kernel_accel_new = available_mem / size_per_accel_tmto;
+
+      kernel_accel_new = MIN (kernel_accel_new, 1024); // max supported
+
+      // luxury option, clamp if we have twice the processors
+
+      if (kernel_accel_new > (device_processors * 2))
+      {
+        const u32 extra = kernel_accel_new % device_processors;
+
+        kernel_accel_new -= extra;
+      }
+
+      // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
+
+      if (kernel_accel_new > device_processors)
+      {
+        const u32 extra = kernel_accel_new % device_processors;
+
+        if (extra < (device_processors * 0.16))
+        {
+          kernel_accel_new -= extra;
+        }
+      }
+    }
   }
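Note: a worked instance of the blocks_perc heuristic above, under assumed numbers that roughly match the 11 GiB discrete-GPU anecdote in the comment (68 processors, scrypt N=16384, r=8, 32 threads per accel unit; all values are hypothetical):

#include <stdio.h>

int main (void)
{
  const unsigned long long available_mem     = 11ULL * 1024 * 1024 * 1024; // after the 512 MiB reserve
  const unsigned long long size_per_accel    = 128ULL * 8 * 16384 * 32;    // 128 * r * N * threads = 512 MiB
  const unsigned           device_processors = 68;

  unsigned tmto;

  for (tmto = 0; tmto < 2; tmto++)
  {
    const unsigned long long size_per_accel_tmto = size_per_accel >> tmto;

    const float blocks      = (float) available_mem / size_per_accel_tmto;
    const float blocks_perc = device_processors / blocks;

    printf ("tmto=%u blocks=%.1f blocks_perc=%.2f\n", tmto, blocks, blocks_perc);

    if (blocks_perc > 1.16f) continue;                // not enough blocks to feed all processors yet
    if ((blocks_perc * (1 << tmto)) < 0.4f) continue; // register-pressure guard

    break;
  }

  // tmto=0 gives 22 blocks for 68 processors (3.09 > 1.16), tmto=1 gives 44 (1.55 > 1.16),
  // so the loop runs out and leaves tmto = 2, as the "results in tmto = 2" comment predicts
  printf ("final tmto=%u\n", tmto);

  return 0;
}

With tmto = 2 the kernel_accel update above would then give available_mem / (size_per_accel >> 2) = 88, which survives both clamps (88 < 2 * 68, and 88 % 68 = 20 is not below 0.16 * 68).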
@@ -189,9 +233,11 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
 
   const u64 size_per_accel = 128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param);
 
-  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
+  const u64 size_per_accel_tmto = size_per_accel >> tmto;
 
-  return size_scrypt / (1 << tmto);
+  const u64 size_scrypt = device_param->kernel_accel_max * size_per_accel_tmto;
+
+  return size_scrypt;
 }
 
 u64 scrypt_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -256,3 +302,4 @@ char *scrypt_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconf
 
   return jit_build_options;
 }
+
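Note: after this change scrypt_module_extra_buffer_size returns exactly kernel_accel_max * ((128 * r * N * threads) >> tmto). Continuing the hypothetical numbers from the previous sketch (N=16384, r=8, 32 threads, tmto=2, kernel_accel_max=88):

#include <stdio.h>

int main (void)
{
  const unsigned long long scrypt_N = 16384, scrypt_r = 8, threads = 32; // assumed values
  const unsigned long long tmto = 2, kernel_accel_max = 88;              // assumed values

  const unsigned long long size_per_accel      = 128ULL * scrypt_r * scrypt_N * threads; // 512 MiB
  const unsigned long long size_per_accel_tmto = size_per_accel >> tmto;                 // 128 MiB

  printf ("%llu bytes\n", kernel_accel_max * size_per_accel_tmto); // 11811160064, about 11 GiB

  return 0;
}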