From c87a87f99299741c36fe822e5a9b6b2c071b1d81 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 9 Jun 2025 11:02:34 +0200 Subject: [PATCH] Improvements to SCRYPT autotuning strategy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit General: The logic for calculating the SCRYPT workload has been moved from module_extra_buffer_size() to module_extra_tuningdb_block(). Previously, this function just returned values from a static tuning file. Now, it actually computes tuning values on the fly based on the device's resources and SCRYPT parameters. This was always possible; it just wasn't used that way until now. After running the calculation, the calculated kernel_accel value is injected into the tuning database as if it had come from a file. The tmto value is stored internally. Users can still override kernel-threads, kernel-accel, and scrypt-tmto via the command line or via tuningdb file. module_extra_tuningdb_block(): This is now where kernel_accel and tmto are automatically calculated. The logic for accel and tmto is now separated and more flexible. Whether the user is using defaults, tuningdb entries, or manual command line overrides, the code logic will try to make smart choices based on what's actually available on the device. First, it tries to find a kernel_accel value that fits into available memory. It starts with a base value and simulates tmto=1 or 2 (which is typically good on GPUs). It also leaves room for other buffers (like pws[], tmps[], etc.). If the result is close to the actual processor count, it gets clamped. This value is then added to the tuning database, so hashcat can pick it up during startup. Once that's set, it derives tmto using available memory, thread count, and the actual SCRYPT parameters. module_extra_buffer_size(): This function now just returns the size of the SCRYPT B[] buffer, based on the tmto that was already calculated. kernel_threads: Defaults are now set to 32 threads in most cases. 
On AMD GPUs, 64 threads might give a slight performance bump, but 32 is more consistent and reliable. For very memory-heavy algorithms (like Ethereum Wallet), it scales down the thread count. Here's a rough reference for other SCRYPT-based modes: - 64 MiB: 16 threads - 256 MiB: 4 threads Tuning files: All built-in tuningdb entries have been removed, because they shouldn’t be needed anymore. But you can still add custom entries if needed. There’s even a commented-out example in the tuningdb file for mode 22700. Free memory handling: Getting the actual amount of free GPU memory is critical for this to work right. Unfortunately, none of the common GPGPU APIs give reliable numbers. We now query low-level interfaces like SYSFS (AMD) and NVML (NVIDIA). Support for those APIs is in place already, except for ADL, which still needs to be added. Because of this, hwmon support (which handles those low-level queries) can no longer be disabled. --- include/ext_nvml.h | 15 +++ include/ext_sysfs_amdgpu.h | 1 + include/hwmon.h | 1 + include/modules.h | 2 +- include/tuningdb.h | 2 +- include/types.h | 3 +- src/backend.c | 42 +++++- src/ext_nvml.c | 22 +++ src/ext_sysfs_amdgpu.c | 52 +++++++ src/hwmon.c | 73 +++++++++- src/modules/module_08900.c | 241 ++++++++++++--------------------- src/modules/module_09300.c | 241 ++++++++++++--------------------- src/modules/module_15700.c | 241 ++++++++++++--------------------- src/modules/module_22700.c | 241 ++++++++++++--------------------- src/modules/module_24000.c | 261 ++++++++++++------------------------ src/modules/module_27700.c | 241 ++++++++++++--------------------- src/modules/module_28200.c | 241 ++++++++++++--------------------- src/modules/module_29800.c | 241 ++++++++++++--------------------- src/tuningdb.c | 17 +-- src/user_options.c | 20 ++- tunings/Module_08900.hctune | 1 - tunings/Module_09300.hctune | 4 - tunings/Module_15700.hctune | 4 - tunings/Module_22700.hctune | 13 +- tunings/Module_24000.hctune | 4 - 
tunings/Module_27700.hctune | 4 - tunings/Module_28200.hctune | 4 - tunings/Module_29800.hctune | 5 - 28 files changed, 941 insertions(+), 1296 deletions(-) diff --git a/include/ext_nvml.h b/include/ext_nvml.h index 02c5d490c..0215e1a32 100644 --- a/include/ext_nvml.h +++ b/include/ext_nvml.h @@ -161,6 +161,18 @@ typedef enum nvmlGom_enum * */ #define nvmlClocksThrottleReasonNone 0x0000000000000000LL +/** + * Memory allocation information for a device (v1). + * The total amount is equal to the sum of the amounts of free and used memory. + */ +typedef struct nvmlMemory_st +{ + unsigned long long total; //!< Total physical device memory (in bytes) + unsigned long long free; //!< Unallocated device memory (in bytes) + unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes). + //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping +} nvmlMemory_t; + /* * End of declarations from nvml.h **/ @@ -191,6 +203,7 @@ typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREAS typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_SET_COMPUTEMODE) (nvmlDevice_t, nvmlComputeMode_t); typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_SET_OPERATIONMODE) (nvmlDevice_t, nvmlGpuOperationMode_t); typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_PCIINFO) (nvmlDevice_t, nvmlPciInfo_t *); +typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_MEMORYINFO) (nvmlDevice_t, nvmlMemory_t *); typedef struct hm_nvml_lib { @@ -212,6 +225,7 @@ typedef struct hm_nvml_lib NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS nvmlDeviceGetCurrentClocksThrottleReasons; NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS nvmlDeviceGetSupportedClocksThrottleReasons; NVML_DEVICE_GET_PCIINFO nvmlDeviceGetPciInfo; + NVML_DEVICE_GET_MEMORYINFO nvmlDeviceGetMemoryInfo; } hm_nvml_lib_t; @@ -232,5 +246,6 @@ int hm_NVML_nvmlDeviceGetClockInfo (void *hashcat_ctx, nvmlDevice_t device, nvml int hm_NVML_nvmlDeviceGetTemperatureThreshold (void 
*hashcat_ctx, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); int hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (void *hashcat_ctx, nvmlDevice_t device, unsigned int *currLinkWidth); int hm_NVML_nvmlDeviceGetPciInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlPciInfo_t *pci); +int hm_NVML_nvmlDeviceGetMemoryInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlMemory_t *mem); #endif // HC_NVML_H diff --git a/include/ext_sysfs_amdgpu.h b/include/ext_sysfs_amdgpu.h index 50c0dc569..d381d9cec 100644 --- a/include/ext_sysfs_amdgpu.h +++ b/include/ext_sysfs_amdgpu.h @@ -34,5 +34,6 @@ int hm_SYSFS_AMDGPU_get_pp_dpm_sclk (void *hashcat_ctx, const int backend_device int hm_SYSFS_AMDGPU_get_pp_dpm_mclk (void *hashcat_ctx, const int backend_device_idx, int *val); int hm_SYSFS_AMDGPU_get_pp_dpm_pcie (void *hashcat_ctx, const int backend_device_idx, int *val); int hm_SYSFS_AMDGPU_get_gpu_busy_percent (void *hashcat_ctx, const int backend_device_idx, int *val); +int hm_SYSFS_AMDGPU_get_mem_info_vram_used (void *hashcat_ctx, const int backend_device_idx, u64 *val); #endif // HC_EXT_SYSFS_AMDGPU_H diff --git a/include/hwmon.h b/include/hwmon.h index 545e22b2d..3d4bd7940 100644 --- a/include/hwmon.h +++ b/include/hwmon.h @@ -24,6 +24,7 @@ int hm_get_utilization_with_devices_idx (hashcat_ctx_t *hashcat_ctx, cons int hm_get_memoryspeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx); int hm_get_corespeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx); int hm_get_throttle_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx); +u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx); int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx); void hwmon_ctx_destroy (hashcat_ctx_t *hashcat_ctx); diff --git a/include/modules.h b/include/modules.h index aed8403ce..713b3f46f 100644 --- a/include/modules.h +++ b/include/modules.h @@ -20,7 +20,7 @@ u32 
module_dgst_pos2 (MAYBE_UNUSED const hashconfig_t *ha u32 module_dgst_pos3 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra); u32 module_dgst_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra); u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra); -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes); +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel); u32 module_forced_outfile_format (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra); u32 module_hash_category (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra); const char *module_hash_name (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra); diff --git a/include/tuningdb.h b/include/tuningdb.h index 608a25cfd..b2cafff67 100644 --- a/include/tuningdb.h +++ b/include/tuningdb.h @@ -17,7 +17,7 @@ int sort_by_tuning_db_entry (const void *v1, const void *v2); int tuning_db_init (hashcat_ctx_t *hashcat_ctx); 
void tuning_db_destroy (hashcat_ctx_t *hashcat_ctx); -bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num, const int source); +bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num); tuning_db_entry_t *tuning_db_search (hashcat_ctx_t *hashcat_ctx, const char *device_name, const cl_device_type device_type, int attack_mode, const int hash_mode); #endif // HC_TUNINGDB_H diff --git a/include/types.h b/include/types.h index d02c1b783..8f265ab14 100644 --- a/include/types.h +++ b/include/types.h @@ -2067,6 +2067,7 @@ typedef struct hm_attrs bool threshold_slowdown_get_supported; bool throttle_get_supported; bool utilization_get_supported; + bool memoryused_get_supported; } hm_attrs_t; @@ -3013,7 +3014,7 @@ typedef struct module_ctx u32 (*module_dgst_size) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *); bool (*module_dictstat_disable) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *); u64 (*module_esalt_size) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *); - const char *(*module_extra_tuningdb_block) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *, const backend_ctx_t *, const hashes_t *); + const char *(*module_extra_tuningdb_block) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *, const backend_ctx_t *, const hashes_t *, const u32, const u32); u32 (*module_forced_outfile_format) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *); u32 (*module_hash_category) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *); const char *(*module_hash_name) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *); diff --git a/src/backend.c b/src/backend.c index c5b95a659..80d1bfd60 100644 --- a/src/backend.c +++ b/src/backend.c @@ -24,6 +24,7 @@ #include "dynloader.h" #include "backend.h" 
#include "terminal.h" +#include "hwmon.h" #if defined (__linux__) static const char *const dri_card0_path = "/dev/dri/card0"; @@ -9649,7 +9650,44 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (module_ctx->module_extra_tuningdb_block != MODULE_DEFAULT) { - const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes); + // We need this because we can't trust CUDA/HIP to give us the real free device memory + // The only way to do so is through low level APIs + + for (int i = 0; i < 10; i++) + { + const u64 used_bytes = hm_get_memoryused_with_devices_idx (hashcat_ctx, device_id); + + if (used_bytes) + { + if ((used_bytes > (2ULL * 1024 * 1024 * 1024)) + || (used_bytes > (device_param->device_global_mem * 0.5))) + { + event_log_warning (hashcat_ctx, "* Device #%u: Memory usage is too high: %" PRIu64 "/%" PRIu64 ", waiting...", device_id + 1, used_bytes, device_param->device_global_mem); + + sleep (1); + + continue; + } + + device_param->device_available_mem -= used_bytes; + + break; + } + else + { + break; + } + } + + u32 _kernel_accel = 0; + + tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode); + + if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel; + + if (user_options->kernel_accel_chgd == true) _kernel_accel = user_options->kernel_accel; + + const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes, device_id, _kernel_accel); char *lines_buf = hcstrdup (extra_tuningdb_block); @@ -9669,7 +9707,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (next[0] == '#') continue; - tuning_db_process_line (hashcat_ctx, next, line_num, 2); + tuning_db_process_line (hashcat_ctx, next, line_num); } while ((next = strtok_r ((char *) NULL, "\n", 
&saveptr)) != NULL); diff --git a/src/ext_nvml.c b/src/ext_nvml.c index 25911df14..e6d49cd08 100644 --- a/src/ext_nvml.c +++ b/src/ext_nvml.c @@ -149,6 +149,7 @@ int nvml_init (void *hashcat_ctx) HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrentClocksThrottleReasons, NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS, NVML, 0); HC_LOAD_FUNC(nvml, nvmlDeviceGetSupportedClocksThrottleReasons, NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS, NVML, 0); HC_LOAD_FUNC(nvml, nvmlDeviceGetPciInfo, NVML_DEVICE_GET_PCIINFO, NVML, 0); + HC_LOAD_FUNC(nvml, nvmlDeviceGetMemoryInfo, NVML_DEVICE_GET_MEMORYINFO, NVML, 0); return 0; } @@ -392,3 +393,24 @@ int hm_NVML_nvmlDeviceGetPciInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlPc return 0; } + +int hm_NVML_nvmlDeviceGetMemoryInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlMemory_t *mem) +{ + hwmon_ctx_t *hwmon_ctx = ((hashcat_ctx_t *) hashcat_ctx)->hwmon_ctx; + + NVML_PTR *nvml = (NVML_PTR *) hwmon_ctx->hm_nvml; + + const nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetMemoryInfo (device, mem); + + if (nvml_rc != NVML_SUCCESS) + { + const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc); + + event_log_error (hashcat_ctx, "nvmlDeviceGetMemoryInfo(): %s", string); + + return -1; + } + + return 0; +} + diff --git a/src/ext_sysfs_amdgpu.c b/src/ext_sysfs_amdgpu.c index 1aa53b210..70f071649 100644 --- a/src/ext_sysfs_amdgpu.c +++ b/src/ext_sysfs_amdgpu.c @@ -441,3 +441,55 @@ int hm_SYSFS_AMDGPU_get_gpu_busy_percent (void *hashcat_ctx, const int backend_d return 0; } + +int hm_SYSFS_AMDGPU_get_mem_info_vram_used (void *hashcat_ctx, const int backend_device_idx, u64 *val) +{ + char *syspath = hm_SYSFS_AMDGPU_get_syspath_device (hashcat_ctx, backend_device_idx); + + if (syspath == NULL) return -1; + + char *path; + + hc_asprintf (&path, "%s/mem_info_vram_used", syspath); + + hcfree (syspath); + + HCFILE fp; + + if (hc_fopen (&fp, path, "r") == false) + { + event_log_error (hashcat_ctx, "%s: %s", path, strerror (errno)); + + hcfree (path); + + return 
-1; + } + + u64 mem_info_vram_used = 0; + + while (!hc_feof (&fp)) + { + char buf[HCBUFSIZ_TINY]; + + char *ptr = hc_fgets (buf, sizeof (buf), &fp); + + if (ptr == NULL) continue; + + size_t len = strlen (ptr); + + if (len < 1) continue; + + int rc = sscanf (ptr, "%" PRIu64, &mem_info_vram_used); + + if (rc == 1) break; + } + + hc_fclose (&fp); + + *val = mem_info_vram_used; + + hcfree (path); + + return 0; +} + diff --git a/src/hwmon.c b/src/hwmon.c index a0f24c644..4f5264b3d 100644 --- a/src/hwmon.c +++ b/src/hwmon.c @@ -1214,6 +1214,60 @@ int hm_get_throttle_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int back return -1; } +u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx) +{ + hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + if (hwmon_ctx->enabled == false) return 0; + + if (hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported == false) return 0; + + if ((backend_ctx->devices_param[backend_device_idx].is_opencl == true) || (backend_ctx->devices_param[backend_device_idx].is_hip == true) || (backend_ctx->devices_param[backend_device_idx].is_cuda == true)) + { + if (backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU) + { + if ((backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD) || (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)) + { + if (hwmon_ctx->hm_sysfs_amdgpu) + { + u64 used = 0; + + if (hm_SYSFS_AMDGPU_get_mem_info_vram_used (hashcat_ctx, backend_device_idx, &used) == -1) + { + hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported = false; + + return 0; + } + + return used; + } + } + + if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV) + { + if (hwmon_ctx->hm_nvml) + { + nvmlMemory_t mem; + + if (hm_NVML_nvmlDeviceGetMemoryInfo (hashcat_ctx, 
hwmon_ctx->hm_device[backend_device_idx].nvml, &mem) == -1) + { + hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported = false; + + return 0; + } + + return mem.used; + } + } + } + } + + hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported = false; + + return 0; +} + int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) { bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; @@ -1227,12 +1281,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) if (bridge_ctx->enabled == true) backend_devices_cnt = 1; - #if !defined (WITH_HWMON) - return 0; - #endif // WITH_HWMON + //#if !defined (WITH_HWMON) + //return 0; + //#endif // WITH_HWMON if (user_options->usage > 0) return 0; - if (user_options->backend_info > 0) return 0; + //if (user_options->backend_info > 0) return 0; if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; @@ -1241,7 +1295,9 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->stdout_flag == true) return 0; if (user_options->version == true) return 0; if (user_options->identify == true) return 0; - if (user_options->hwmon == false) return 0; + //we need hwmon support to get free memory per device support + //its a joke, but there's no way around + //if (user_options->hwmon == false) return 0; hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); @@ -1387,6 +1443,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hm_adapters_nvml[device_id].threshold_shutdown_get_supported = true; hm_adapters_nvml[device_id].threshold_slowdown_get_supported = true; hm_adapters_nvml[device_id].utilization_get_supported = true; + hm_adapters_nvml[device_id].memoryused_get_supported = true; } } } @@ -1419,6 +1476,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hm_adapters_nvml[device_id].threshold_shutdown_get_supported = true; hm_adapters_nvml[device_id].threshold_slowdown_get_supported = true; hm_adapters_nvml[device_id].utilization_get_supported = true; + 
hm_adapters_nvml[device_id].memoryused_get_supported = true; } } } @@ -1640,6 +1698,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true; hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true; hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported = true; } } } @@ -1746,6 +1805,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported; hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported |= hm_adapters_nvml[device_id].throttle_get_supported; hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported |= hm_adapters_nvml[device_id].utilization_get_supported; + hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported |= hm_adapters_nvml[device_id].memoryused_get_supported; } if (hwmon_ctx->hm_nvapi) @@ -1875,6 +1935,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported |= hm_adapters_sysfs_amdgpu[device_id].threshold_slowdown_get_supported; hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported |= hm_adapters_sysfs_amdgpu[device_id].throttle_get_supported; hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported |= hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported; + hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported |= hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported; } } @@ -1895,6 +1956,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported; hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported |= hm_adapters_nvml[device_id].throttle_get_supported; 
hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported |= hm_adapters_nvml[device_id].utilization_get_supported; + hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported |= hm_adapters_nvml[device_id].memoryused_get_supported; } if (hwmon_ctx->hm_nvapi) @@ -1927,6 +1989,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hm_get_threshold_slowdown_with_devices_idx (hashcat_ctx, backend_devices_idx); hm_get_throttle_with_devices_idx (hashcat_ctx, backend_devices_idx); hm_get_utilization_with_devices_idx (hashcat_ctx, backend_devices_idx); + hm_get_memoryused_with_devices_idx (hashcat_ctx, backend_devices_idx); } FREE_ADAPTERS; diff --git a/src/modules/module_08900.c b/src/modules/module_08900.c index 0865e8575..42fd456be 100644 --- a/src/modules/module_08900.c +++ b/src/modules/module_08900.c @@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, static const char *SIGNATURE_SCRYPT = "SCRYPT"; +static const u32 SCRYPT_THREADS = 32; + static const u64 SCRYPT_N = 16384; static const u64 SCRYPT_R = 8; static const u64 SCRYPT_P = 1; @@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; + + return kernel_threads_min; +} + u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 32; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u64 tmto_start = 0; - u64 tmto_stop = 4; + u64 size_scrypt = size_per_accel * device_param->kernel_accel_max; - if (user_options->scrypt_tmto_chgd == true) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } - - // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. 
- // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit - - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks - + size_pws_pre - + size_pws_base - + EXTRA_SPACE; - - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u64 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -527,7 +464,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = 
module_kernel_threads_max; - module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; diff --git a/src/modules/module_09300.c b/src/modules/module_09300.c index 8f92e7fce..4f0f5bbb5 100644 --- a/src/modules/module_09300.c +++ b/src/modules/module_09300.c @@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, static const char *SIGNATURE_CISCO9 = "$9$"; +static const u32 SCRYPT_THREADS = 32; + static const u64 SCRYPT_N = 16384; static const u64 SCRYPT_R = 1; static const u64 SCRYPT_P = 1; @@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; + + return kernel_threads_min; +} + u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 32; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u64 tmto_start = 0; - u64 tmto_stop = 4; + u64 size_scrypt = req1 * device_param->kernel_accel_max; - if (user_options->scrypt_tmto_chgd == true) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } - - // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. 
- // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit - - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks - + size_pws_pre - + size_pws_base - + EXTRA_SPACE; - - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u64 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -488,7 +425,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = 
module_kernel_threads_max; - module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; diff --git a/src/modules/module_15700.c b/src/modules/module_15700.c index c7a357dd0..063106d2f 100644 --- a/src/modules/module_15700.c +++ b/src/modules/module_15700.c @@ -56,6 +56,8 @@ typedef struct ethereum_scrypt static const char *SIGNATURE_ETHEREUM_SCRYPT = "$ethereum$s"; +static const u32 SCRYPT_THREADS = 4; + static const u64 SCRYPT_N = 262144; static const u64 SCRYPT_R = 8; static const u64 SCRYPT_P = 1; @@ -74,9 +76,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; + + return kernel_threads_min; +} + u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 4; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -98,90 +107,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -193,115 +234,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u64 tmto_start = 0; - u64 tmto_stop = 4; + u64 size_scrypt = size_per_accel * device_param->kernel_accel_max; - if (user_options->scrypt_tmto_chgd == true) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } - - // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. 
- // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit - - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks - + size_pws_pre - + size_pws_base - + EXTRA_SPACE; - - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u64 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -587,7 +524,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = 
module_kernel_threads_max; - module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; diff --git a/src/modules/module_22700.c b/src/modules/module_22700.c index 30c106625..1b9113bd4 100644 --- a/src/modules/module_22700.c +++ b/src/modules/module_22700.c @@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, static const char *SIGNATURE_MULTIBIT = "$multibit$"; +static const u32 SCRYPT_THREADS = 32; + static const u64 SCRYPT_N = 16384; static const u64 SCRYPT_R = 8; static const u64 SCRYPT_P = 1; @@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; + + return kernel_threads_min; +} + u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 32; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u64 tmto_start = 0; - u64 tmto_stop = 4; + u64 size_scrypt = size_per_accel * device_param->kernel_accel_max; - if (user_options->scrypt_tmto_chgd == true) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } - - // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. 
- // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit - - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks - + size_pws_pre - + size_pws_base - + EXTRA_SPACE; - - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u64 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -526,7 +463,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = 
module_kernel_threads_max; - module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; diff --git a/src/modules/module_24000.c b/src/modules/module_24000.c index 62217f2d8..159acbed0 100644 --- a/src/modules/module_24000.c +++ b/src/modules/module_24000.c @@ -57,27 +57,13 @@ typedef struct bestcrypt_scrypt // 16 is actually a bit low, we may need to change this depending on user response static const char *SIGNATURE_BESTCRYPT_SCRYPT = "$bcve$"; -static const u32 SCRYPT_MAX_ACCEL = 256; -static const u32 SCRYPT_MAX_THREADS = 4; + +static const u32 SCRYPT_THREADS = 16; static const u64 SCRYPT_N = 32768; static const u64 SCRYPT_R = 16; static const u64 SCRYPT_P = 1; -u32 module_kernel_accel_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_accel_min = 1; - - return kernel_accel_min; -} - -u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_accel_max = (user_options->kernel_accel_chgd == true) ? 
user_options->kernel_accel : SCRYPT_MAX_ACCEL; - - return kernel_accel_max; -} - u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_min = 1; @@ -94,14 +80,14 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_min = 1; + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_min; } u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_MAX_THREADS; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -123,90 +109,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -215,121 +233,14 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE // we need to set the self-test hash settings to pass the self-test // the decoder for the self-test is called after this function - const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; - const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; + const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; + const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u32 tmto_start = 1; - u32 tmto_stop = 6; + u64 size_scrypt = size_per_accel * device_param->kernel_accel_max; - if (user_options->scrypt_tmto) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - -/* - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } -*/ - - // 
sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. - // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit -/* - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); -*/ - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks; -// + size_pws_pre -// + size_pws_base; -/* - + EXTRA_SPACE; -*/ - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u32 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -593,8 +504,8 @@ void module_init (module_ctx_t *module_ctx) 
module_ctx->module_hook_size = MODULE_DEFAULT; module_ctx->module_jit_build_options = module_jit_build_options; module_ctx->module_jit_cache_disable = MODULE_DEFAULT; - module_ctx->module_kernel_accel_max = module_kernel_accel_max; - module_ctx->module_kernel_accel_min = module_kernel_accel_min; + module_ctx->module_kernel_accel_max = MODULE_DEFAULT; + module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = module_kernel_threads_max; diff --git a/src/modules/module_27700.c b/src/modules/module_27700.c index 089deb5fa..fb3a31fa1 100644 --- a/src/modules/module_27700.c +++ b/src/modules/module_27700.c @@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, static const char *SIGNATURE_MULTIBIT = "$multibit$"; +static const u32 SCRYPT_THREADS = 32; + static const u64 SCRYPT_N = 16384; static const u64 SCRYPT_R = 8; static const u64 SCRYPT_P = 1; @@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; + + return kernel_threads_min; +} + u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 32; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -84,90 +93,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u64 tmto_start = 0; - u64 tmto_stop = 4; + u64 size_scrypt = size_per_accel * device_param->kernel_accel_max; - if (user_options->scrypt_tmto_chgd == true) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } - - // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. 
- // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit - - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks - + size_pws_pre - + size_pws_base - + EXTRA_SPACE; - - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u64 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -550,7 +487,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = 
module_kernel_threads_max; - module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; diff --git a/src/modules/module_28200.c b/src/modules/module_28200.c index 86a636adf..52a7adbdd 100644 --- a/src/modules/module_28200.c +++ b/src/modules/module_28200.c @@ -57,6 +57,8 @@ typedef struct exodus static const char *SIGNATURE_EXODUS = "EXODUS"; +static const u32 SCRYPT_THREADS = 32; + static const u64 SCRYPT_N = 16384; static const u64 SCRYPT_R = 8; static const u64 SCRYPT_P = 1; @@ -75,9 +77,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; + + return kernel_threads_min; +} + u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 32; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -96,90 +105,122 @@ u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED return esalt_size; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -191,115 +232,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u64 tmto_start = 0; - u64 tmto_stop = 4; + u64 size_scrypt = size_per_accel * device_param->kernel_accel_max; - if (user_options->scrypt_tmto_chgd == true) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } - - // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. 
- // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit - - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks - + size_pws_pre - + size_pws_base - + EXTRA_SPACE; - - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u64 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -634,7 +571,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = 
module_kernel_threads_max; - module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; diff --git a/src/modules/module_29800.c b/src/modules/module_29800.c index d1be6be39..633ef1978 100644 --- a/src/modules/module_29800.c +++ b/src/modules/module_29800.c @@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, static const char *SIGNATURE_BISQ = "$bisq$"; +static const u32 SCRYPT_THREADS = 16; + static const u64 SCRYPT_N = 32768; static const u64 SCRYPT_R = 8; static const u64 SCRYPT_P = 6; @@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS; + + return kernel_threads_min; +} + u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 32; + const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : SCRYPT_THREADS; return kernel_threads_max; } @@ -91,90 +100,122 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes) +u32 tmto = 0; + +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel) { + // preprocess tmto in case user has overridden + // it's important to set to 0 otherwise so we can postprocess tmto in that case + + tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0; + // we enforce the same configuration for all hashes, so this should be fine const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? 
hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); + const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto; int lines_sz = 4096; char *lines_buf = hcmalloc (lines_sz); int lines_pos = 0; - for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + const u32 device_processors = device_param->device_processors; + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // from command line or tuning db has priority - if (device_param->skipped == true) continue; - - const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1); - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - char *out_name = new_device_name; - - if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4; - if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7; - - // ok, try to find a nice accel programmatically - - u32 accel = device_param->device_processors; + kernel_accel_new = user_options->kernel_accel; + } + else + { + // find a nice kernel_accel programmatically if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // expect to change any of this - - if (avail < (req1 * accel)) // not enough memory + if ((size_per_accel * device_processors) > available_mem) // not enough memory { - const float multi = (float) avail / req1; + const float multi = 
(float) available_mem / size_per_accel; - accel = multi; + int accel_multi; - for (int i = 1; i <= 4; i++) // this is tmto + for (accel_multi = 1; accel_multi <= 2; accel_multi++) { - if (device_param->device_processors > accel) - { - accel = ((u64) multi << i) & ~3; - } + kernel_accel_new = multi * (1 << accel_multi); + + if (kernel_accel_new >= device_processors) break; + } + + // we need some space for tmps[], ... + + kernel_accel_new -= (1 << accel_multi); + + // clamp if close to device processors -- 10% good? + + if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10))) + { + kernel_accel_new = device_processors; } } else { for (int i = 1; i <= 8; i++) { - if ((avail * 2) > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } } else { - const u64 req1 = 128 * scrypt_r * scrypt_N; - for (int i = 1; i <= 8; i++) { - if (avail > (req1 * accel)) + if ((size_per_accel * device_processors * i) < available_mem) { - accel = device_param->device_processors * i; + kernel_accel_new = device_processors * i; } } } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel); - - hcfree (new_device_name); } + // fix tmto if user allows + + if (tmto == 0) + { + const u32 tmto_start = 1; + const u32 tmto_stop = 5; + + for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++) + { + if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new))) + { + tmto = tmto_new; + + break; + } + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, 
kernel_accel_new); + + hcfree (new_device_name); + return lines_buf; } @@ -186,115 +227,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N; const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R; - const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max; + const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra); - u64 tmto_start = 0; - u64 tmto_stop = 4; + u64 size_scrypt = size_per_accel * device_param->kernel_accel_max; - if (user_options->scrypt_tmto_chgd == true) - { - tmto_start = user_options->scrypt_tmto; - tmto_stop = user_options->scrypt_tmto; - } - - // size_pws - - const u64 size_pws = kernel_power_max * sizeof (pw_t); - - const u64 size_pws_amp = size_pws; - - // size_pws_comp - - const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64); - - // size_pws_idx - - const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t); - - // size_tmps - - const u64 size_tmps = kernel_power_max * hashconfig->tmp_size; - - // size_hooks - - const u64 size_hooks = kernel_power_max * hashconfig->hook_size; - - u64 size_pws_pre = 4; - u64 size_pws_base = 4; - - if (user_options->slow_candidates == true) - { - // size_pws_pre - - size_pws_pre = kernel_power_max * sizeof (pw_pre_t); - - // size_pws_base - - size_pws_base = kernel_power_max * sizeof (pw_pre_t); - } - - // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. - // let's add some extra space just to be sure. 
- // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit - - u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max; - - EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL)); - EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - const u64 scrypt_extra_space - = device_param->size_bfs - + device_param->size_combs - + device_param->size_digests - + device_param->size_esalts - + device_param->size_markov_css - + device_param->size_plains - + device_param->size_results - + device_param->size_root_css - + device_param->size_rules - + device_param->size_rules_c - + device_param->size_salts - + device_param->size_shown - + device_param->size_tm - + device_param->size_st_digests - + device_param->size_st_salts - + device_param->size_st_esalts - + size_pws - + size_pws_amp - + size_pws_comp - + size_pws_idx - + size_tmps - + size_hooks - + size_pws_pre - + size_pws_base - + EXTRA_SPACE; - - bool not_enough_memory = true; - - u64 size_scrypt = 0; - - u64 tmto; - - for (tmto = tmto_start; tmto <= tmto_stop; tmto++) - { - size_scrypt = (128ULL * scrypt_r) * scrypt_N; - - size_scrypt /= 1ull << tmto; - - size_scrypt *= kernel_power_max; - - if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue; - - if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue; - - not_enough_memory = false; - - break; - } - - if (not_enough_memory == true) return -1; - - return size_scrypt; + return size_scrypt / (1 << tmto); } u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -557,7 +494,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = 
module_kernel_threads_max; - module_ctx->module_kernel_threads_min = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; diff --git a/src/tuningdb.c b/src/tuningdb.c index 406359ab4..1c5e6cb32 100644 --- a/src/tuningdb.c +++ b/src/tuningdb.c @@ -43,11 +43,6 @@ int sort_by_tuning_db_entry (const void *v1, const void *v2) if (res3 != 0) return (res3); - const int res4 = t1->source - - t2->source; - - if (res4 != 0) return (res4); - return 0; } @@ -118,7 +113,7 @@ int tuning_db_init (hashcat_ctx_t *hashcat_ctx) if (line_buf[0] == '#') continue; - tuning_db_process_line (hashcat_ctx, line_buf, line_num, 1); + tuning_db_process_line (hashcat_ctx, line_buf, line_num); } hcfree (buf); @@ -167,7 +162,7 @@ void tuning_db_destroy (hashcat_ctx_t *hashcat_ctx) memset (tuning_db, 0, sizeof (tuning_db_t)); } -bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num, const int source) +bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num) { tuning_db_t *tuning_db = hashcat_ctx->tuning_db; user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; @@ -353,7 +348,6 @@ bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, c entry->vector_width = vector_width; entry->kernel_accel = kernel_accel; entry->kernel_loops = kernel_loops; - entry->source = source; tuning_db->entry_cnt++; } @@ -430,12 +424,11 @@ static tuning_db_entry_t *tuning_db_search_real (hashcat_ctx_t *hashcat_ctx, con // this will produce all 2^3 combinations required - for (i = 0; i < 16; i++) + for (i = 0; i < 8; i++) { - s.source = (i & 1) ? 2 : 1; + s.device_name = (i & 1) ? "*" : device_name_nospace; s.attack_mode = (i & 2) ? -1 : attack_mode; s.hash_mode = (i & 4) ? 
-1 : hash_mode; - s.device_name = (i & 8) ? "*" : device_name_nospace; entry = (tuning_db_entry_t *) bsearch (&s, tuning_db->entry_buf, tuning_db->entry_cnt, sizeof (tuning_db_entry_t), sort_by_tuning_db_entry); @@ -443,7 +436,7 @@ static tuning_db_entry_t *tuning_db_search_real (hashcat_ctx_t *hashcat_ctx, con // in non-wildcard mode do some additional checks: - if ((i & 8) == 0) + if ((i & 1) == 0) { // in case we have an alias-name diff --git a/src/user_options.c b/src/user_options.c index 217e8d3f3..7dbe6567d 100644 --- a/src/user_options.c +++ b/src/user_options.c @@ -379,8 +379,8 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv) case IDX_INCREMENT_MAX: case IDX_HOOK_THREADS: case IDX_BACKEND_DEVICES_VIRTMULTI: - case IDX_BACKEND_DEVICES_VIRTHOST: - case IDX_BACKEND_DEVICES_KEEPFREE: + case IDX_BACKEND_DEVICES_VIRTHOST: + case IDX_BACKEND_DEVICES_KEEPFREE: case IDX_BENCHMARK_MAX: case IDX_BENCHMARK_MIN: #ifdef WITH_BRAIN @@ -816,14 +816,14 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx) event_log_error (hashcat_ctx, "Invalid --backend-devices-virthost value specified."); return -1; - } + } if (user_options->backend_devices_keepfree > 100) { event_log_error (hashcat_ctx, "Invalid --backend-devices-keepfree value specified."); return -1; - } + } if (user_options->outfile_format == 0) { @@ -1895,6 +1895,14 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) } #endif + if (user_options->hwmon == false) + { + // some algorithm, such as SCRYPT, depend on accurate free memory values + // the only way to get them is through low-level APIs such as nvml via hwmon + + user_options->hwmon = true; + } + if (user_options->stdout_flag) { user_options->hwmon = false; @@ -3325,8 +3333,8 @@ void user_options_logger (hashcat_ctx_t *hashcat_ctx) logfile_top_uint64 (user_options->skip); logfile_top_uint (user_options->attack_mode); logfile_top_uint (user_options->backend_devices_virtmulti); - logfile_top_uint 
(user_options->backend_devices_virthost); - logfile_top_uint (user_options->backend_devices_keepfree); + logfile_top_uint (user_options->backend_devices_virthost); + logfile_top_uint (user_options->backend_devices_keepfree); logfile_top_uint (user_options->benchmark); logfile_top_uint (user_options->benchmark_all); logfile_top_uint (user_options->benchmark_max); diff --git a/tunings/Module_08900.hctune b/tunings/Module_08900.hctune index ecaa0e353..46df052b5 100644 --- a/tunings/Module_08900.hctune +++ b/tunings/Module_08900.hctune @@ -24,4 +24,3 @@ # It's better to derive the tuning based on the hash information (handled by the hash-mode plugin). # The tunings from the hash-mode plugin may be slightly off, so if you have better values, you can hardcode them here. - diff --git a/tunings/Module_09300.hctune b/tunings/Module_09300.hctune index 3277390ab..d98505795 100644 --- a/tunings/Module_09300.hctune +++ b/tunings/Module_09300.hctune @@ -19,7 +19,3 @@ #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops -GeForce_RTX_4090 * 9300 1 512 A -ALIAS_AMD_RX6900XT * 9300 1 720 A -ALIAS_AMD_RX7900XTX * 9300 1 840 A - diff --git a/tunings/Module_15700.hctune b/tunings/Module_15700.hctune index c19ae375e..a44bd5a9c 100644 --- a/tunings/Module_15700.hctune +++ b/tunings/Module_15700.hctune @@ -19,7 +19,3 @@ #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops -GeForce_RTX_4090 * 15700 1 180 A -ALIAS_AMD_RX6900XT * 15700 1 56 A -ALIAS_AMD_RX7900XTX * 15700 1 92 A - diff --git a/tunings/Module_22700.hctune b/tunings/Module_22700.hctune index be4cd8a4a..c08bd7a51 100644 --- a/tunings/Module_22700.hctune +++ b/tunings/Module_22700.hctune @@ -19,7 +19,14 @@ #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops -GeForce_RTX_4090 * 22700 1 180 A -ALIAS_AMD_RX6900XT * 22700 1 56 A -ALIAS_AMD_RX7900XTX * 22700 1 92 A +#Leaving this here as a reference +#GeForce_GTX_980 * 22700 1 28 A +#GeForce_GTX_1630 * 22700 1 11 A 
+#GeForce_RTX_2080_Ti * 22700 1 78 A +#GeForce_RTX_3090 * 22700 1 82 A +#GeForce_RTX_4090 * 22700 1 180 A +#ALIAS_AMD_RX480 * 22700 1 28 A +#ALIAS_AMD_Vega64 * 22700 1 28 A +#ALIAS_AMD_RX6900XT * 22700 1 56 A +#ALIAS_AMD_RX7900XTX * 22700 1 92 A diff --git a/tunings/Module_24000.hctune b/tunings/Module_24000.hctune index 71f61fe67..52e4b78bb 100644 --- a/tunings/Module_24000.hctune +++ b/tunings/Module_24000.hctune @@ -19,7 +19,3 @@ #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops -GeForce_RTX_4090 * 24000 1 180 A -ALIAS_AMD_RX6900XT * 24000 1 56 A -ALIAS_AMD_RX7900XTX * 24000 1 92 A - diff --git a/tunings/Module_27700.hctune b/tunings/Module_27700.hctune index 32b5253b4..095c829f6 100644 --- a/tunings/Module_27700.hctune +++ b/tunings/Module_27700.hctune @@ -19,7 +19,3 @@ #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops -GeForce_RTX_4090 * 27700 1 180 A -ALIAS_AMD_RX6900XT * 27700 1 56 A -ALIAS_AMD_RX7900XTX * 27700 1 92 A - diff --git a/tunings/Module_28200.hctune b/tunings/Module_28200.hctune index 50a09b89c..2759beb00 100644 --- a/tunings/Module_28200.hctune +++ b/tunings/Module_28200.hctune @@ -19,7 +19,3 @@ #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops -GeForce_RTX_4090 * 28200 1 180 A -ALIAS_AMD_RX6900XT * 28200 1 56 A -ALIAS_AMD_RX7900XTX * 28200 1 92 A - diff --git a/tunings/Module_29800.hctune b/tunings/Module_29800.hctune index 31bea6286..ce9ebd31d 100644 --- a/tunings/Module_29800.hctune +++ b/tunings/Module_29800.hctune @@ -18,8 +18,3 @@ #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops - -GeForce_RTX_4090 * 29800 1 180 A -ALIAS_AMD_RX6900XT * 29800 1 56 A -ALIAS_AMD_RX7900XTX * 29800 1 92 A -