diff --git a/include/ext_nvml.h b/include/ext_nvml.h
index 02c5d490c..0215e1a32 100644
--- a/include/ext_nvml.h
+++ b/include/ext_nvml.h
@@ -161,6 +161,18 @@ typedef enum nvmlGom_enum
  * */
 #define nvmlClocksThrottleReasonNone                      0x0000000000000000LL
 
+/**
+ * Memory allocation information for a device (v1), as filled in by nvmlDeviceGetMemoryInfo().
+ * The total amount is equal to the sum of the amounts of free and used memory.
+ */
+typedef struct nvmlMemory_st
+{
+    unsigned long long total;        //!< Total physical device memory (in bytes)
+    unsigned long long free;         //!< Unallocated device memory (in bytes)
+    unsigned long long used;         //!< Sum of Reserved and Allocated device memory (in bytes).
+                                     //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
+} nvmlMemory_t;
+
 /*
  * End of declarations from nvml.h
  **/
@@ -191,6 +203,7 @@ typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREAS
 typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_SET_COMPUTEMODE) (nvmlDevice_t, nvmlComputeMode_t);
 typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_SET_OPERATIONMODE) (nvmlDevice_t, nvmlGpuOperationMode_t);
 typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_PCIINFO) (nvmlDevice_t, nvmlPciInfo_t *);
+typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_MEMORYINFO) (nvmlDevice_t, nvmlMemory_t *);
 
 typedef struct hm_nvml_lib
 {
@@ -212,6 +225,7 @@ typedef struct hm_nvml_lib
   NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS nvmlDeviceGetCurrentClocksThrottleReasons;
   NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS nvmlDeviceGetSupportedClocksThrottleReasons;
   NVML_DEVICE_GET_PCIINFO nvmlDeviceGetPciInfo;
+  NVML_DEVICE_GET_MEMORYINFO nvmlDeviceGetMemoryInfo;
 
 } hm_nvml_lib_t;
 
@@ -232,5 +246,6 @@ int hm_NVML_nvmlDeviceGetClockInfo (void *hashcat_ctx, nvmlDevice_t device, nvml
 int hm_NVML_nvmlDeviceGetTemperatureThreshold (void *hashcat_ctx, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
 int hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (void *hashcat_ctx, nvmlDevice_t device, unsigned int *currLinkWidth);
 int hm_NVML_nvmlDeviceGetPciInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlPciInfo_t *pci);
+int hm_NVML_nvmlDeviceGetMemoryInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlMemory_t *mem);
 
 #endif // HC_NVML_H
diff --git a/include/ext_sysfs_amdgpu.h b/include/ext_sysfs_amdgpu.h
index 50c0dc569..d381d9cec 100644
--- a/include/ext_sysfs_amdgpu.h
+++ b/include/ext_sysfs_amdgpu.h
@@ -34,5 +34,6 @@ int hm_SYSFS_AMDGPU_get_pp_dpm_sclk (void *hashcat_ctx, const int backend_device
 int hm_SYSFS_AMDGPU_get_pp_dpm_mclk (void *hashcat_ctx, const int backend_device_idx, int *val);
 int hm_SYSFS_AMDGPU_get_pp_dpm_pcie (void *hashcat_ctx, const int backend_device_idx, int *val);
 int hm_SYSFS_AMDGPU_get_gpu_busy_percent (void *hashcat_ctx, const int backend_device_idx, int *val);
+int hm_SYSFS_AMDGPU_get_mem_info_vram_used (void *hashcat_ctx, const int backend_device_idx, u64 *val);
 
 #endif // HC_EXT_SYSFS_AMDGPU_H
diff --git a/include/hwmon.h b/include/hwmon.h
index 545e22b2d..3d4bd7940 100644
--- a/include/hwmon.h
+++ b/include/hwmon.h
@@ -24,6 +24,7 @@ int hm_get_utilization_with_devices_idx        (hashcat_ctx_t *hashcat_ctx, cons
 int hm_get_memoryspeed_with_devices_idx        (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
 int hm_get_corespeed_with_devices_idx          (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
 int hm_get_throttle_with_devices_idx           (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
+u64 hm_get_memoryused_with_devices_idx         (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
 
 int  hwmon_ctx_init    (hashcat_ctx_t *hashcat_ctx);
 void hwmon_ctx_destroy (hashcat_ctx_t *hashcat_ctx);
diff --git a/include/modules.h b/include/modules.h
index aed8403ce..713b3f46f 100644
--- a/include/modules.h
+++ b/include/modules.h
@@ -20,7 +20,7 @@ u32         module_dgst_pos2                (MAYBE_UNUSED const hashconfig_t *ha
 u32         module_dgst_pos3                (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
 u32         module_dgst_size                (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
 u64         module_esalt_size               (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
-const char *module_extra_tuningdb_block     (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes);
+const char *module_extra_tuningdb_block     (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel);
 u32         module_forced_outfile_format    (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
 u32         module_hash_category            (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
 const char *module_hash_name                (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
diff --git a/include/tuningdb.h b/include/tuningdb.h
index 608a25cfd..b2cafff67 100644
--- a/include/tuningdb.h
+++ b/include/tuningdb.h
@@ -17,7 +17,7 @@ int sort_by_tuning_db_entry (const void *v1, const void *v2);
 int  tuning_db_init    (hashcat_ctx_t *hashcat_ctx);
 void tuning_db_destroy (hashcat_ctx_t *hashcat_ctx);
 
-bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num, const int source);
+bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num);
 tuning_db_entry_t *tuning_db_search (hashcat_ctx_t *hashcat_ctx, const char *device_name, const cl_device_type device_type, int attack_mode, const int hash_mode);
 
 #endif // HC_TUNINGDB_H
diff --git a/include/types.h b/include/types.h
index d02c1b783..8f265ab14 100644
--- a/include/types.h
+++ b/include/types.h
@@ -2067,6 +2067,7 @@ typedef struct hm_attrs
   bool threshold_slowdown_get_supported;
   bool throttle_get_supported;
   bool utilization_get_supported;
+  bool memoryused_get_supported;
 
 } hm_attrs_t;
 
@@ -3013,7 +3014,7 @@ typedef struct module_ctx
   u32         (*module_dgst_size)               (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
   bool        (*module_dictstat_disable)        (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
   u64         (*module_esalt_size)              (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
-  const char *(*module_extra_tuningdb_block)    (const hashconfig_t *, const user_options_t *, const user_options_extra_t *, const backend_ctx_t *, const hashes_t *);
+  const char *(*module_extra_tuningdb_block)    (const hashconfig_t *, const user_options_t *, const user_options_extra_t *, const backend_ctx_t *, const hashes_t *, const u32, const u32);
   u32         (*module_forced_outfile_format)   (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
   u32         (*module_hash_category)           (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
   const char *(*module_hash_name)               (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
diff --git a/src/backend.c b/src/backend.c
index c5b95a659..80d1bfd60 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -24,6 +24,7 @@
 #include "dynloader.h"
 #include "backend.h"
 #include "terminal.h"
+#include "hwmon.h"
 
 #if defined (__linux__)
 static const char *const  dri_card0_path = "/dev/dri/card0";
@@ -9649,7 +9650,44 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     if (module_ctx->module_extra_tuningdb_block != MODULE_DEFAULT)
     {
-      const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes);
+      // We need this because we can't trust CUDA/HIP to give us the real free device memory
+      // The only way to do so is through low level APIs
+      // Usage above 2 GiB or above half of the device memory is retried up to 10 times, one second apart
+
+      for (int i = 0; i < 10; i++)
+      {
+        const u64 used_bytes = hm_get_memoryused_with_devices_idx (hashcat_ctx, device_id);
+
+        // 0 means the usage is unknown (hwmon disabled, unsupported or query failed) -- nothing to account for
+
+        if (used_bytes == 0) break;
+
+        if ((used_bytes > (2ULL * 1024 * 1024 * 1024))
+         || (used_bytes > (device_param->device_global_mem / 2)))
+        {
+          event_log_warning (hashcat_ctx, "* Device #%u: Memory usage is too high: %" PRIu64 "/%" PRIu64 ", waiting...", device_id + 1, used_bytes, device_param->device_global_mem);
+
+          sleep (1);
+
+          continue;
+        }
+
+        // never subtract more than what is left, otherwise the counter would wrap around
+
+        device_param->device_available_mem -= (used_bytes < device_param->device_available_mem) ? used_bytes : device_param->device_available_mem;
+
+        break;
+      }
+
+      u32 _kernel_accel = 0;
+
+      tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
+
+      if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
+
+      if (user_options->kernel_accel_chgd == true) _kernel_accel = user_options->kernel_accel;
+
+      const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes, device_id, _kernel_accel);
 
       char *lines_buf = hcstrdup (extra_tuningdb_block);
 
@@ -9669,7 +9707,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
         if (next[0] == '#') continue;
 
-        tuning_db_process_line (hashcat_ctx, next, line_num, 2);
+        tuning_db_process_line (hashcat_ctx, next, line_num);
 
       } while ((next = strtok_r ((char *) NULL, "\n", &saveptr)) != NULL);
 
diff --git a/src/ext_nvml.c b/src/ext_nvml.c
index 25911df14..e6d49cd08 100644
--- a/src/ext_nvml.c
+++ b/src/ext_nvml.c
@@ -149,6 +149,7 @@ int nvml_init (void *hashcat_ctx)
   HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrentClocksThrottleReasons, NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS, NVML, 0);
   HC_LOAD_FUNC(nvml, nvmlDeviceGetSupportedClocksThrottleReasons, NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS, NVML, 0);
   HC_LOAD_FUNC(nvml, nvmlDeviceGetPciInfo, NVML_DEVICE_GET_PCIINFO, NVML, 0);
+  HC_LOAD_FUNC(nvml, nvmlDeviceGetMemoryInfo, NVML_DEVICE_GET_MEMORYINFO, NVML, 0);
 
   return 0;
 }
@@ -392,3 +393,24 @@ int hm_NVML_nvmlDeviceGetPciInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlPc
 
   return 0;
 }
+
+// Calls nvmlDeviceGetMemoryInfo () to fill *mem; returns 0 on success, logs the NVML error string and returns -1 otherwise
+int hm_NVML_nvmlDeviceGetMemoryInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlMemory_t *mem)
+{
+  hwmon_ctx_t *hwmon_ctx = ((hashcat_ctx_t *) hashcat_ctx)->hwmon_ctx;
+
+  NVML_PTR *nvml = (NVML_PTR *) hwmon_ctx->hm_nvml;
+
+  const nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetMemoryInfo (device, mem);
+
+  if (nvml_rc != NVML_SUCCESS)
+  {
+    const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
+
+    event_log_error (hashcat_ctx, "nvmlDeviceGetMemoryInfo(): %s", string);
+
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/src/ext_sysfs_amdgpu.c b/src/ext_sysfs_amdgpu.c
index 1aa53b210..70f071649 100644
--- a/src/ext_sysfs_amdgpu.c
+++ b/src/ext_sysfs_amdgpu.c
@@ -441,3 +441,55 @@ int hm_SYSFS_AMDGPU_get_gpu_busy_percent (void *hashcat_ctx, const int backend_d
 
   return 0;
 }
+
+// Reads <syspath>/mem_info_vram_used and stores the first number found in *val.
+// Returns 0 on success; -1 (with *val untouched) if the path is unknown, the file cannot be opened or nothing parses.
+int hm_SYSFS_AMDGPU_get_mem_info_vram_used (void *hashcat_ctx, const int backend_device_idx, u64 *val)
+{
+  char *syspath = hm_SYSFS_AMDGPU_get_syspath_device (hashcat_ctx, backend_device_idx);
+
+  if (syspath == NULL) return -1;
+
+  char *path;
+
+  hc_asprintf (&path, "%s/mem_info_vram_used", syspath);
+
+  hcfree (syspath);
+
+  HCFILE fp;
+
+  if (hc_fopen (&fp, path, "r") == false)
+  {
+    event_log_error (hashcat_ctx, "%s: %s", path, strerror (errno));
+
+    hcfree (path);
+
+    return -1;
+  }
+
+  u64 mem_info_vram_used = 0;
+
+  int rc = -1;
+
+  char buf[HCBUFSIZ_TINY];
+
+  // stop at the first NULL from hc_fgets (): polling hc_feof () instead would loop forever
+  // if a read fails without the EOF flag being set
+
+  while (hc_fgets (buf, sizeof (buf), &fp) != NULL)
+  {
+    if (sscanf (buf, "%" PRIu64, &mem_info_vram_used) != 1) continue;
+
+    rc = 0;
+
+    break;
+  }
+
+  hc_fclose (&fp);
+
+  hcfree (path);
+
+  if (rc == 0) *val = mem_info_vram_used;
+
+  return rc;
+}
diff --git a/src/hwmon.c b/src/hwmon.c
index a0f24c644..4f5264b3d 100644
--- a/src/hwmon.c
+++ b/src/hwmon.c
@@ -1214,6 +1214,60 @@ int hm_get_throttle_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int back
   return -1;
 }
 
+// Returns the number of bytes currently in use on the device, or 0 if that is unknown.
+// Queries sysfs on AMD GPUs and NVML on NVIDIA GPUs; any failed or unavailable query
+// clears memoryused_get_supported so that later calls return 0 right away.
+u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
+{
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (hwmon_ctx->enabled == false) return 0;
+
+  hm_attrs_t *hm_attrs = &hwmon_ctx->hm_device[backend_device_idx];
+
+  if (hm_attrs->memoryused_get_supported == false) return 0;
+
+  const hc_device_param_t *device_param = &backend_ctx->devices_param[backend_device_idx];
+
+  const bool is_compute = (device_param->is_opencl == true) || (device_param->is_hip == true) || (device_param->is_cuda == true);
+  const bool is_gpu     = (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) != 0;
+
+  if (is_compute && is_gpu)
+  {
+    const bool is_amd = (device_param->opencl_device_vendor_id == VENDOR_ID_AMD) || (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP);
+    const bool is_nv  = (device_param->opencl_device_vendor_id == VENDOR_ID_NV);
+
+    if (is_amd && hwmon_ctx->hm_sysfs_amdgpu)
+    {
+      u64 vram_used = 0;
+
+      if (hm_SYSFS_AMDGPU_get_mem_info_vram_used (hashcat_ctx, backend_device_idx, &vram_used) != -1) return vram_used;
+
+      hm_attrs->memoryused_get_supported = false;
+
+      return 0;
+    }
+
+    if (is_nv && hwmon_ctx->hm_nvml)
+    {
+      nvmlMemory_t nvml_mem;
+
+      if (hm_NVML_nvmlDeviceGetMemoryInfo (hashcat_ctx, hm_attrs->nvml, &nvml_mem) != -1) return nvml_mem.used;
+
+      hm_attrs->memoryused_get_supported = false;
+
+      return 0;
+    }
+  }
+
+  // no usable backend for this device
+
+  hm_attrs->memoryused_get_supported = false;
+
+  return 0;
+}
+
 int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
 {
   bridge_ctx_t   *bridge_ctx   = hashcat_ctx->bridge_ctx;
@@ -1227,12 +1281,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (bridge_ctx->enabled == true) backend_devices_cnt = 1;
 
-  #if !defined (WITH_HWMON)
-  return 0;
-  #endif // WITH_HWMON
+  //#if !defined (WITH_HWMON)
+  //return 0;
+  //#endif // WITH_HWMON
 
   if (user_options->usage          > 0)     return 0;
-  if (user_options->backend_info   > 0)     return 0;
+  //if (user_options->backend_info   > 0)     return 0;
 
   if (user_options->hash_info     == true)  return 0;
   if (user_options->keyspace      == true)  return 0;
@@ -1241,7 +1295,9 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
   if (user_options->stdout_flag   == true)  return 0;
   if (user_options->version       == true)  return 0;
   if (user_options->identify      == true)  return 0;
-  if (user_options->hwmon         == false) return 0;
+  // hwmon is initialized even when user_options->hwmon is false: it is the only way to
+  // query per-device memory usage (see hm_get_memoryused_with_devices_idx ())
+  //if (user_options->hwmon         == false) return 0;
 
   hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
 
@@ -1387,6 +1443,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
               hm_adapters_nvml[device_id].threshold_shutdown_get_supported  = true;
               hm_adapters_nvml[device_id].threshold_slowdown_get_supported  = true;
               hm_adapters_nvml[device_id].utilization_get_supported         = true;
+              hm_adapters_nvml[device_id].memoryused_get_supported          = true;
             }
           }
         }
@@ -1419,6 +1476,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
               hm_adapters_nvml[device_id].threshold_shutdown_get_supported  = true;
               hm_adapters_nvml[device_id].threshold_slowdown_get_supported  = true;
               hm_adapters_nvml[device_id].utilization_get_supported         = true;
+              hm_adapters_nvml[device_id].memoryused_get_supported          = true;
             }
           }
         }
@@ -1640,6 +1698,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
             hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true;
             hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true;
             hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true;
+            hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported  = true;
           }
         }
       }
@@ -1746,6 +1805,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
         hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported;
         hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_nvml[device_id].throttle_get_supported;
         hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_nvml[device_id].utilization_get_supported;
+        hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported          |= hm_adapters_nvml[device_id].memoryused_get_supported;
       }
 
       if (hwmon_ctx->hm_nvapi)
@@ -1875,6 +1935,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
             hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_sysfs_amdgpu[device_id].threshold_slowdown_get_supported;
             hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_sysfs_amdgpu[device_id].throttle_get_supported;
             hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported;
+            hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported          |= hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported;
           }
         }
 
@@ -1895,6 +1956,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
             hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported  |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported;
             hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported            |= hm_adapters_nvml[device_id].throttle_get_supported;
             hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported         |= hm_adapters_nvml[device_id].utilization_get_supported;
+            hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported          |= hm_adapters_nvml[device_id].memoryused_get_supported;
           }
 
           if (hwmon_ctx->hm_nvapi)
@@ -1927,6 +1989,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     hm_get_threshold_slowdown_with_devices_idx (hashcat_ctx, backend_devices_idx);
     hm_get_throttle_with_devices_idx           (hashcat_ctx, backend_devices_idx);
     hm_get_utilization_with_devices_idx        (hashcat_ctx, backend_devices_idx);
+    hm_get_memoryused_with_devices_idx         (hashcat_ctx, backend_devices_idx);
   }
 
   FREE_ADAPTERS;
diff --git a/src/modules/module_08900.c b/src/modules/module_08900.c
index 0865e8575..42fd456be 100644
--- a/src/modules/module_08900.c
+++ b/src/modules/module_08900.c
@@ -49,6 +49,8 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 
 static const char *SIGNATURE_SCRYPT = "SCRYPT";
 
+static const u32 SCRYPT_THREADS = 32;
+
 static const u64 SCRYPT_N = 16384;
 static const u64 SCRYPT_R = 8;
 static const u64 SCRYPT_P = 1;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
+u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // a thread count set by the user wins, otherwise fall back to SCRYPT_THREADS
+
+  return (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
+}
+
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = 32;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
 
   return kernel_threads_max;
 }
@@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+static u32 tmto = 0; // internal linkage: keeps the symbol from clashing with other modules that define their own tmto
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // preprocess tmto in case user has overridden
+  // it's important to set to 0 otherwise so we can postprocess tmto in that case
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // from command line or tuning db has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel; // already resolved by the caller: tuning db value, overridden by the command line
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... (saturate at 1 so the unsigned value cannot wrap around)
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp if close to device processors -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // fix tmto if user allows
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u64 tmto_start = 0;
-  u64 tmto_stop  = 4;
+  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto_chgd == true)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks
-    + size_pws_pre
-    + size_pws_base
-    + EXTRA_SPACE;
-
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u64 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt / (1 << tmto);
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -527,7 +464,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
   module_ctx->module_opti_type                = module_opti_type;
diff --git a/src/modules/module_09300.c b/src/modules/module_09300.c
index 8f92e7fce..4f0f5bbb5 100644
--- a/src/modules/module_09300.c
+++ b/src/modules/module_09300.c
@@ -49,6 +49,8 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 
 static const char *SIGNATURE_CISCO9 = "$9$";
 
+static const u32 SCRYPT_THREADS = 32;
+
 static const u64 SCRYPT_N = 16384;
 static const u64 SCRYPT_R = 1;
 static const u64 SCRYPT_P = 1;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
+u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // user-set thread count wins, else SCRYPT_THREADS; same expression as module_kernel_threads_max
+
+  return kernel_threads_min;
+}
+
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = 32;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
 
   return kernel_threads_max;
 }
@@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+static u32 tmto = 0; // scrypt TMTO shift: written by module_extra_tuningdb_block, read by module_extra_buffer_size; file-local on purpose
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // preprocess tmto in case user has overridden
+  // it's important to set to 0 otherwise so we can postprocess tmto in that case
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // from command line or tuning db has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel; // take the override the caller passed in (this branch is guarded by it), not only the command-line field
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... (saturate at 1: kernel_accel_new is unsigned and must not wrap below the reserve)
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp if close to device processors -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // fix tmto if user allows
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u64 tmto_start = 0;
-  u64 tmto_stop  = 4;
+  u64 size_scrypt = req1 * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto_chgd == true)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks
-    + size_pws_pre
-    + size_pws_base
-    + EXTRA_SPACE;
-
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u64 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt >> tmto; // shift the u64 directly: (1 << tmto) is int-typed and undefined for tmto >= 31
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -488,7 +425,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
   module_ctx->module_opti_type                = module_opti_type;
diff --git a/src/modules/module_15700.c b/src/modules/module_15700.c
index c7a357dd0..063106d2f 100644
--- a/src/modules/module_15700.c
+++ b/src/modules/module_15700.c
@@ -56,6 +56,8 @@ typedef struct ethereum_scrypt
 
 static const char *SIGNATURE_ETHEREUM_SCRYPT = "$ethereum$s";
 
+static const u32 SCRYPT_THREADS = 4;
+
 static const u64 SCRYPT_N = 262144;
 static const u64 SCRYPT_R = 8;
 static const u64 SCRYPT_P = 1;
@@ -74,9 +76,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
+u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // user-set thread count wins, else SCRYPT_THREADS; same expression as module_kernel_threads_max
+
+  return kernel_threads_min;
+}
+
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = 4;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
 
   return kernel_threads_max;
 }
@@ -98,90 +107,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+static u32 tmto = 0; // scrypt TMTO shift: written by module_extra_tuningdb_block, read by module_extra_buffer_size; file-local on purpose
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // preprocess tmto in case user has overridden
+  // it's important to set to 0 otherwise so we can postprocess tmto in that case
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // from command line or tuning db has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel; // take the override the caller passed in (this branch is guarded by it), not only the command-line field
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... (saturate at 1: kernel_accel_new is unsigned and must not wrap below the reserve)
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp if close to device processors -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // fix tmto if user allows
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -193,115 +234,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u64 tmto_start = 0;
-  u64 tmto_stop  = 4;
+  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto_chgd == true)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks
-    + size_pws_pre
-    + size_pws_base
-    + EXTRA_SPACE;
-
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u64 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt >> tmto; // shift the u64 directly: (1 << tmto) is int-typed and undefined for tmto >= 31
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -587,7 +524,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
   module_ctx->module_opti_type                = module_opti_type;
diff --git a/src/modules/module_22700.c b/src/modules/module_22700.c
index 30c106625..1b9113bd4 100644
--- a/src/modules/module_22700.c
+++ b/src/modules/module_22700.c
@@ -49,6 +49,8 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 
 static const char *SIGNATURE_MULTIBIT = "$multibit$";
 
+static const u32 SCRYPT_THREADS = 32;
+
 static const u64 SCRYPT_N = 16384;
 static const u64 SCRYPT_R = 8;
 static const u64 SCRYPT_P = 1;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
+u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // user-set thread count wins, else SCRYPT_THREADS; same expression as module_kernel_threads_max
+
+  return kernel_threads_min;
+}
+
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = 32;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
 
   return kernel_threads_max;
 }
@@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+static u32 tmto = 0; // scrypt TMTO shift: written by module_extra_tuningdb_block, read by module_extra_buffer_size; file-local on purpose
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // preprocess tmto in case user has overridden
+  // it's important to set to 0 otherwise so we can postprocess tmto in that case
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // from command line or tuning db has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel; // take the override the caller passed in (this branch is guarded by it), not only the command-line field
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... (saturate at 1: kernel_accel_new is unsigned and must not wrap below the reserve)
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp if close to device processors -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // fix tmto if user allows
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u64 tmto_start = 0;
-  u64 tmto_stop  = 4;
+  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto_chgd == true)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks
-    + size_pws_pre
-    + size_pws_base
-    + EXTRA_SPACE;
-
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u64 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt >> tmto; // shift the u64 directly: (1 << tmto) is int-typed and undefined for tmto >= 31
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -526,7 +463,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
   module_ctx->module_opti_type                = module_opti_type;
diff --git a/src/modules/module_24000.c b/src/modules/module_24000.c
index 62217f2d8..159acbed0 100644
--- a/src/modules/module_24000.c
+++ b/src/modules/module_24000.c
@@ -57,27 +57,13 @@ typedef struct bestcrypt_scrypt
 // 16 is actually a bit low, we may need to change this depending on user response
 
 static const char *SIGNATURE_BESTCRYPT_SCRYPT = "$bcve$";
-static const u32   SCRYPT_MAX_ACCEL          = 256;
-static const u32   SCRYPT_MAX_THREADS        = 4;
+
+static const u32 SCRYPT_THREADS = 16;
 
 static const u64 SCRYPT_N = 32768;
 static const u64 SCRYPT_R = 16;
 static const u64 SCRYPT_P = 1;
 
-u32 module_kernel_accel_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_accel_min = 1;
-
-  return kernel_accel_min;
-}
-
-u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_accel_max = (user_options->kernel_accel_chgd == true) ? user_options->kernel_accel : SCRYPT_MAX_ACCEL;
-
-  return kernel_accel_max;
-}
-
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_min = 1;
@@ -94,14 +80,14 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
 
 u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_min = 1;
+  const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // a user-set thread count wins, otherwise SCRYPT_THREADS (same expression as module_kernel_threads_max)
 
   return kernel_threads_min;
 }
 
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_MAX_THREADS;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // a user-set thread count wins, otherwise SCRYPT_THREADS
 
   return kernel_threads_max;
 }
@@ -123,90 +109,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+u32 tmto = 0;
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // seed the file-scope tmto from the user's scrypt_tmto override, if there is one
+  // otherwise it has to be reset to 0 here so the auto-selection near the end of this function can run
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // a kernel_accel handed in by the caller (command line or tuning db) has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel;
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically, scaled to device_processors and available_mem
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... -- but keep at least 1 so the u32 subtraction cannot wrap
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp to device processors if we overshoot by no more than 10% -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // tmto still 0 (not fixed above): take the first tmto in [1, 5] whose scaled-down size fits into available_mem
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -215,121 +233,14 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   // we need to set the self-test hash settings to pass the self-test
   // the decoder for the self-test is called after this function
 
-  const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
-  const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
+  const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
+  const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u32 tmto_start = 1;
-  u32 tmto_stop  = 6;
+  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-/*
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-*/
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-/*
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-*/
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks;
-//    + size_pws_pre
-//    + size_pws_base;
-/*
-    + EXTRA_SPACE;
-*/
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u32 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt / (1 << tmto);
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -593,8 +504,8 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size                = MODULE_DEFAULT;
   module_ctx->module_jit_build_options        = module_jit_build_options;
   module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max         = module_kernel_accel_max;
-  module_ctx->module_kernel_accel_min         = module_kernel_accel_min;
+  module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
diff --git a/src/modules/module_27700.c b/src/modules/module_27700.c
index 089deb5fa..fb3a31fa1 100644
--- a/src/modules/module_27700.c
+++ b/src/modules/module_27700.c
@@ -49,6 +49,8 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 
 static const char *SIGNATURE_MULTIBIT = "$multibit$";
 
+static const u32 SCRYPT_THREADS = 32;
+
 static const u64 SCRYPT_N = 16384;
 static const u64 SCRYPT_R = 8;
 static const u64 SCRYPT_P = 1;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
+u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // a user-set thread count wins, otherwise SCRYPT_THREADS (same expression as module_kernel_threads_max)
+
+  return kernel_threads_min;
+}
+
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = 32;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // a user-set thread count wins, otherwise SCRYPT_THREADS
 
   return kernel_threads_max;
 }
@@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+u32 tmto = 0;
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // seed the file-scope tmto from the user's scrypt_tmto override, if there is one
+  // otherwise it has to be reset to 0 here so the auto-selection near the end of this function can run
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // a kernel_accel handed in by the caller (command line or tuning db) has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel;
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically, scaled to device_processors and available_mem
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... -- but keep at least 1 so the u32 subtraction cannot wrap
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp to device processors if we overshoot by no more than 10% -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // tmto still 0 (not fixed above): take the first tmto in [1, 5] whose scaled-down size fits into available_mem
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u64 tmto_start = 0;
-  u64 tmto_stop  = 4;
+  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto_chgd == true)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks
-    + size_pws_pre
-    + size_pws_base
-    + EXTRA_SPACE;
-
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u64 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt / (1 << tmto);
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -550,7 +487,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
   module_ctx->module_opti_type                = module_opti_type;
diff --git a/src/modules/module_28200.c b/src/modules/module_28200.c
index 86a636adf..52a7adbdd 100644
--- a/src/modules/module_28200.c
+++ b/src/modules/module_28200.c
@@ -57,6 +57,8 @@ typedef struct exodus
 
 static const char *SIGNATURE_EXODUS = "EXODUS";
 
+static const u32 SCRYPT_THREADS = 32;
+
 static const u64 SCRYPT_N = 16384;
 static const u64 SCRYPT_R = 8;
 static const u64 SCRYPT_P = 1;
@@ -75,9 +77,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
+u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // a user-set thread count wins, otherwise SCRYPT_THREADS (same expression as module_kernel_threads_max)
+
+  return kernel_threads_min;
+}
+
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = 32;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; // a user-set thread count wins, otherwise SCRYPT_THREADS
 
   return kernel_threads_max;
 }
@@ -96,90 +105,122 @@ u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED
   return esalt_size;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+u32 tmto = 0;
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // seed the file-scope tmto from the user's scrypt_tmto override, if there is one
+  // otherwise it has to be reset to 0 here so the auto-selection near the end of this function can run
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // a kernel_accel handed in by the caller (command line or tuning db) has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel;
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically, scaled to device_processors and available_mem
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... -- but keep at least 1 so the u32 subtraction cannot wrap
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp to device processors if we overshoot by no more than 10% -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // tmto still 0 (not fixed above): take the first tmto in [1, 5] whose scaled-down size fits into available_mem
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -191,115 +232,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u64 tmto_start = 0;
-  u64 tmto_stop  = 4;
+  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto_chgd == true)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks
-    + size_pws_pre
-    + size_pws_base
-    + EXTRA_SPACE;
-
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u64 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt / (1 << tmto);
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -634,7 +571,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
   module_ctx->module_opti_type                = module_opti_type;
diff --git a/src/modules/module_29800.c b/src/modules/module_29800.c
index d1be6be39..633ef1978 100644
--- a/src/modules/module_29800.c
+++ b/src/modules/module_29800.c
@@ -49,6 +49,8 @@ const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig,
 
 static const char *SIGNATURE_BISQ = "$bisq$";
 
+static const u32 SCRYPT_THREADS = 16;
+
 static const u64 SCRYPT_N = 32768;
 static const u64 SCRYPT_R = 8;
 static const u64 SCRYPT_P = 6;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
+u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // an explicit user-supplied thread count wins, otherwise fall back to SCRYPT_THREADS
+  if (user_options->kernel_threads_chgd == true) return user_options->kernel_threads;
+  return SCRYPT_THREADS;
+}
+
 u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = 32;
+  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
 
   return kernel_threads_max;
 }
@@ -91,90 +100,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
+u32 tmto = 0;
+
+const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
 {
+  // take tmto from the command line if the user set it explicitly,
+  // otherwise reset it to 0 so the automatic tmto selection at the end of this function runs
+
+  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
+
   // we enforce the same configuration for all hashes, so this should be fine
 
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
+  const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
 
   int   lines_sz  = 4096;
   char *lines_buf = hcmalloc (lines_sz);
   int   lines_pos = 0;
 
-  for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel)
   {
-    hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+    // the value handed in by the caller (command line or tuning db) has priority
 
-    if (device_param->skipped == true) continue;
-
-    const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
-
-    char *new_device_name = hcstrdup (device_param->device_name);
-
-    for (size_t i = 0; i < strlen (new_device_name); i++)
-    {
-      if (new_device_name[i] == ' ') new_device_name[i] = '_';
-    }
-
-    char *out_name = new_device_name;
-
-    if (memcmp (new_device_name, "AMD_",    4) == 0) out_name += 4;
-    if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
-
-    // ok, try to find a nice accel programmatically
-
-    u32 accel = device_param->device_processors;
+    kernel_accel_new = kernel_accel;
+  }
+  else
+  {
+    // find a nice kernel_accel programmatically
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
-      // expect to change any of this
-
-      if (avail < (req1 * accel)) // not enough memory
+      if ((size_per_accel * device_processors) > available_mem) // not enough memory
       {
-        const float multi = (float) avail / req1;
+        const float multi = (float) available_mem / size_per_accel;
 
-        accel = multi;
+        int accel_multi;
 
-        for (int i = 1; i <= 4; i++) // this is tmto
+        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
         {
-          if (device_param->device_processors > accel)
-          {
-            accel = ((u64) multi << i) & ~3;
-          }
+          kernel_accel_new = multi * (1 << accel_multi);
+
+          if (kernel_accel_new >= device_processors) break;
+        }
+
+        // we need some space for tmps[], ... -- floor at 1 so the unsigned value cannot wrap around
+
+        kernel_accel_new = (kernel_accel_new > (1u << accel_multi)) ? (kernel_accel_new - (1u << accel_multi)) : 1;
+
+        // clamp if close to device processors -- 10% good?
+
+        if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
+        {
+          kernel_accel_new = device_processors;
         }
       }
       else
       {
         for (int i = 1; i <= 8; i++)
         {
-          if ((avail * 2) > (req1 * accel))
+          if ((size_per_accel * device_processors * i) < available_mem)
           {
-            accel = device_param->device_processors * i;
+            kernel_accel_new = device_processors * i;
           }
         }
       }
     }
     else
     {
-      const u64 req1 = 128 * scrypt_r * scrypt_N;
-
       for (int i = 1; i <= 8; i++)
       {
-        if (avail > (req1 * accel))
+        if ((size_per_accel * device_processors * i) < available_mem)
         {
-          accel = device_param->device_processors * i;
+          kernel_accel_new = device_processors * i;
         }
       }
     }
-
-    lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
-
-    hcfree (new_device_name);
   }
 
+  // pick the smallest tmto that fits into memory, but only if the user did not force one
+
+  if (tmto == 0)
+  {
+    const u32 tmto_start = 1;
+    const u32 tmto_stop  = 5;
+
+    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    {
+      if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
+      {
+        tmto = tmto_new;
+
+        break;
+      }
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (char *p = new_device_name; *p != 0; p++)
+  {
+    if (*p == ' ') *p = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
   return lines_buf;
 }
 
@@ -186,115 +227,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
   const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
 
-  const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+  const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
 
-  u64 tmto_start = 0;
-  u64 tmto_stop  = 4;
+  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  if (user_options->scrypt_tmto_chgd == true)
-  {
-    tmto_start = user_options->scrypt_tmto;
-    tmto_stop  = user_options->scrypt_tmto;
-  }
-
-  // size_pws
-
-  const u64 size_pws = kernel_power_max * sizeof (pw_t);
-
-  const u64 size_pws_amp = size_pws;
-
-  // size_pws_comp
-
-  const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
-
-  // size_pws_idx
-
-  const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
-
-  // size_tmps
-
-  const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
-
-  // size_hooks
-
-  const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
-
-  u64 size_pws_pre  = 4;
-  u64 size_pws_base = 4;
-
-  if (user_options->slow_candidates == true)
-  {
-    // size_pws_pre
-
-    size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
-
-    // size_pws_base
-
-    size_pws_base = kernel_power_max * sizeof (pw_pre_t);
-  }
-
-  // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
-  // let's add some extra space just to be sure.
-  // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
-
-  u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
-
-  EXTRA_SPACE = MAX (EXTRA_SPACE, (  64ULL * 1024ULL * 1024ULL));
-  EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-  const u64 scrypt_extra_space
-    = device_param->size_bfs
-    + device_param->size_combs
-    + device_param->size_digests
-    + device_param->size_esalts
-    + device_param->size_markov_css
-    + device_param->size_plains
-    + device_param->size_results
-    + device_param->size_root_css
-    + device_param->size_rules
-    + device_param->size_rules_c
-    + device_param->size_salts
-    + device_param->size_shown
-    + device_param->size_tm
-    + device_param->size_st_digests
-    + device_param->size_st_salts
-    + device_param->size_st_esalts
-    + size_pws
-    + size_pws_amp
-    + size_pws_comp
-    + size_pws_idx
-    + size_tmps
-    + size_hooks
-    + size_pws_pre
-    + size_pws_base
-    + EXTRA_SPACE;
-
-  bool not_enough_memory = true;
-
-  u64 size_scrypt = 0;
-
-  u64 tmto;
-
-  for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
-  {
-    size_scrypt = (128ULL * scrypt_r) * scrypt_N;
-
-    size_scrypt /= 1ull << tmto;
-
-    size_scrypt *= kernel_power_max;
-
-    if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
-
-    if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
-
-    not_enough_memory = false;
-
-    break;
-  }
-
-  if (not_enough_memory == true) return -1;
-
-  return size_scrypt;
+  return size_scrypt / (1 << tmto);
 }
 
 u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -557,7 +494,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
   module_ctx->module_opti_type                = module_opti_type;
diff --git a/src/tuningdb.c b/src/tuningdb.c
index 406359ab4..1c5e6cb32 100644
--- a/src/tuningdb.c
+++ b/src/tuningdb.c
@@ -43,11 +43,6 @@ int sort_by_tuning_db_entry (const void *v1, const void *v2)
 
   if (res3 != 0) return (res3);
 
-  const int res4 = t1->source
-                 - t2->source;
-
-  if (res4 != 0) return (res4);
-
   return 0;
 }
 
@@ -118,7 +113,7 @@ int tuning_db_init (hashcat_ctx_t *hashcat_ctx)
 
       if (line_buf[0] == '#') continue;
 
-      tuning_db_process_line (hashcat_ctx, line_buf, line_num, 1);
+      tuning_db_process_line (hashcat_ctx, line_buf, line_num);
     }
 
     hcfree (buf);
@@ -167,7 +162,7 @@ void tuning_db_destroy (hashcat_ctx_t *hashcat_ctx)
   memset (tuning_db, 0, sizeof (tuning_db_t));
 }
 
-bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num, const int source)
+bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num)
 {
   tuning_db_t           *tuning_db          = hashcat_ctx->tuning_db;
   user_options_extra_t  *user_options_extra = hashcat_ctx->user_options_extra;
@@ -353,7 +348,6 @@ bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, c
     entry->vector_width = vector_width;
     entry->kernel_accel = kernel_accel;
     entry->kernel_loops = kernel_loops;
-    entry->source       = source;
 
     tuning_db->entry_cnt++;
   }
@@ -430,12 +424,11 @@ static tuning_db_entry_t *tuning_db_search_real (hashcat_ctx_t *hashcat_ctx, con
 
   // this will produce all 2^3 combinations required
 
-  for (i = 0; i < 16; i++)
+  for (i = 0; i < 8; i++)
   {
-    s.source      = (i & 1) ?   2 : 1;
+    s.device_name = (i & 1) ? "*" : device_name_nospace;
     s.attack_mode = (i & 2) ?  -1 : attack_mode;
     s.hash_mode   = (i & 4) ?  -1 : hash_mode;
-    s.device_name = (i & 8) ? "*" : device_name_nospace;
 
     entry = (tuning_db_entry_t *) bsearch (&s, tuning_db->entry_buf, tuning_db->entry_cnt, sizeof (tuning_db_entry_t), sort_by_tuning_db_entry);
 
@@ -443,7 +436,7 @@ static tuning_db_entry_t *tuning_db_search_real (hashcat_ctx_t *hashcat_ctx, con
 
     // in non-wildcard mode do some additional checks:
 
-    if ((i & 8) == 0)
+    if ((i & 1) == 0)
     {
       // in case we have an alias-name
 
diff --git a/src/user_options.c b/src/user_options.c
index 217e8d3f3..7dbe6567d 100644
--- a/src/user_options.c
+++ b/src/user_options.c
@@ -379,8 +379,8 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv)
       case IDX_INCREMENT_MAX:
       case IDX_HOOK_THREADS:
       case IDX_BACKEND_DEVICES_VIRTMULTI:
-      case IDX_BACKEND_DEVICES_VIRTHOST:      
-      case IDX_BACKEND_DEVICES_KEEPFREE:      
+      case IDX_BACKEND_DEVICES_VIRTHOST:
+      case IDX_BACKEND_DEVICES_KEEPFREE:
       case IDX_BENCHMARK_MAX:
       case IDX_BENCHMARK_MIN:
       #ifdef WITH_BRAIN
@@ -816,14 +816,14 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx)
     event_log_error (hashcat_ctx, "Invalid --backend-devices-virthost value specified.");
 
     return -1;
-  }  
+  }
 
   if (user_options->backend_devices_keepfree > 100)
   {
     event_log_error (hashcat_ctx, "Invalid --backend-devices-keepfree value specified.");
 
     return -1;
-  }  
+  }
 
   if (user_options->outfile_format == 0)
   {
@@ -1895,6 +1895,14 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
   }
   #endif
 
+  if (user_options->hwmon == false)
+  {
+    // some algorithms, such as SCRYPT, depend on accurate free memory values
+    // the only way to get them is through low-level APIs such as nvml via hwmon
+
+    user_options->hwmon = true;
+  }
+
   if (user_options->stdout_flag)
   {
     user_options->hwmon               = false;
@@ -3325,8 +3333,8 @@ void user_options_logger (hashcat_ctx_t *hashcat_ctx)
   logfile_top_uint64 (user_options->skip);
   logfile_top_uint   (user_options->attack_mode);
   logfile_top_uint   (user_options->backend_devices_virtmulti);
-  logfile_top_uint   (user_options->backend_devices_virthost);  
-  logfile_top_uint   (user_options->backend_devices_keepfree);  
+  logfile_top_uint   (user_options->backend_devices_virthost);
+  logfile_top_uint   (user_options->backend_devices_keepfree);
   logfile_top_uint   (user_options->benchmark);
   logfile_top_uint   (user_options->benchmark_all);
   logfile_top_uint   (user_options->benchmark_max);
diff --git a/tunings/Module_08900.hctune b/tunings/Module_08900.hctune
index ecaa0e353..46df052b5 100644
--- a/tunings/Module_08900.hctune
+++ b/tunings/Module_08900.hctune
@@ -24,4 +24,3 @@
 # It's better to derive the tuning based on the hash information (handled by the hash-mode plugin).
 # The tunings from the hash-mode plugin may be slightly off, so if you have better values, you can hardcode them here.
 
-
diff --git a/tunings/Module_09300.hctune b/tunings/Module_09300.hctune
index 3277390ab..d98505795 100644
--- a/tunings/Module_09300.hctune
+++ b/tunings/Module_09300.hctune
@@ -19,7 +19,3 @@
 #Device                                         Attack  Hash    Vector  Kernel  Kernel
 #Name                                           Mode    Type    Width   Accel   Loops
 
-GeForce_RTX_4090                                *       9300    1       512     A
-ALIAS_AMD_RX6900XT                              *       9300    1       720     A
-ALIAS_AMD_RX7900XTX                             *       9300    1       840     A
-
diff --git a/tunings/Module_15700.hctune b/tunings/Module_15700.hctune
index c19ae375e..a44bd5a9c 100644
--- a/tunings/Module_15700.hctune
+++ b/tunings/Module_15700.hctune
@@ -19,7 +19,3 @@
 #Device                                         Attack  Hash    Vector  Kernel  Kernel
 #Name                                           Mode    Type    Width   Accel   Loops
 
-GeForce_RTX_4090                                *       15700   1       180     A
-ALIAS_AMD_RX6900XT                              *       15700   1       56      A
-ALIAS_AMD_RX7900XTX                             *       15700   1       92      A
-
diff --git a/tunings/Module_22700.hctune b/tunings/Module_22700.hctune
index be4cd8a4a..c08bd7a51 100644
--- a/tunings/Module_22700.hctune
+++ b/tunings/Module_22700.hctune
@@ -19,7 +19,14 @@
 #Device                                         Attack  Hash    Vector  Kernel  Kernel
 #Name                                           Mode    Type    Width   Accel   Loops
 
-GeForce_RTX_4090                                *       22700   1       180     A
-ALIAS_AMD_RX6900XT                              *       22700   1       56      A
-ALIAS_AMD_RX7900XTX                             *       22700   1       92      A
+#Leaving this here as a reference
+#GeForce_GTX_980                                 *       22700   1       28      A
+#GeForce_GTX_1630                                *       22700   1       11      A
+#GeForce_RTX_2080_Ti                             *       22700   1       78      A
+#GeForce_RTX_3090                                *       22700   1       82      A
+#GeForce_RTX_4090                                *       22700   1       180     A
+#ALIAS_AMD_RX480                                 *       22700   1       28      A
+#ALIAS_AMD_Vega64                                *       22700   1       28      A
+#ALIAS_AMD_RX6900XT                              *       22700   1       56      A
+#ALIAS_AMD_RX7900XTX                             *       22700   1       92      A
 
diff --git a/tunings/Module_24000.hctune b/tunings/Module_24000.hctune
index 71f61fe67..52e4b78bb 100644
--- a/tunings/Module_24000.hctune
+++ b/tunings/Module_24000.hctune
@@ -19,7 +19,3 @@
 #Device                                         Attack  Hash    Vector  Kernel  Kernel
 #Name                                           Mode    Type    Width   Accel   Loops
 
-GeForce_RTX_4090                                *       24000   1       180     A
-ALIAS_AMD_RX6900XT                              *       24000   1       56      A
-ALIAS_AMD_RX7900XTX                             *       24000   1       92      A
-
diff --git a/tunings/Module_27700.hctune b/tunings/Module_27700.hctune
index 32b5253b4..095c829f6 100644
--- a/tunings/Module_27700.hctune
+++ b/tunings/Module_27700.hctune
@@ -19,7 +19,3 @@
 #Device                                         Attack  Hash    Vector  Kernel  Kernel
 #Name                                           Mode    Type    Width   Accel   Loops
 
-GeForce_RTX_4090                                *       27700   1       180     A
-ALIAS_AMD_RX6900XT                              *       27700   1       56      A
-ALIAS_AMD_RX7900XTX                             *       27700   1       92      A
-
diff --git a/tunings/Module_28200.hctune b/tunings/Module_28200.hctune
index 50a09b89c..2759beb00 100644
--- a/tunings/Module_28200.hctune
+++ b/tunings/Module_28200.hctune
@@ -19,7 +19,3 @@
 #Device                                         Attack  Hash    Vector  Kernel  Kernel
 #Name                                           Mode    Type    Width   Accel   Loops
 
-GeForce_RTX_4090                                *       28200   1       180     A
-ALIAS_AMD_RX6900XT                              *       28200   1       56      A
-ALIAS_AMD_RX7900XTX                             *       28200   1       92      A
-
diff --git a/tunings/Module_29800.hctune b/tunings/Module_29800.hctune
index 31bea6286..ce9ebd31d 100644
--- a/tunings/Module_29800.hctune
+++ b/tunings/Module_29800.hctune
@@ -18,8 +18,3 @@
 
 #Device                                         Attack  Hash    Vector  Kernel  Kernel
 #Name                                           Mode    Type    Width   Accel   Loops
-
-GeForce_RTX_4090                                *       29800   1       180     A
-ALIAS_AMD_RX6900XT                              *       29800   1       56      A
-ALIAS_AMD_RX7900XTX                             *       29800   1       92      A
-