
Improvements to SCRYPT autotuning strategy

General:

The logic for calculating the SCRYPT workload has been moved
from module_extra_buffer_size() to module_extra_tuningdb_block().
Previously, that function just returned entries from a static
tuning file. Now it computes tuning values on the fly, based on
the device's resources and the SCRYPT parameters. This was
always possible; it just wasn't used that way until now.

After the calculation, the resulting kernel_accel value is
injected into the tuning database as if it had come from a
file. The tmto value is stored internally.

Users can still override kernel-threads, kernel-accel, and
scrypt-tmto via the command line or a tuningdb file.
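
For example, all three can be pinned on the command line like
this (illustrative values; -n/--kernel-accel, -T/--kernel-threads
and --scrypt-tmto are existing hashcat options):

    hashcat -m 22700 -a 0 hash.txt wordlist.txt \
      --kernel-accel=64 --kernel-threads=16 --scrypt-tmto=2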

module_extra_tuningdb_block():

This is now where kernel_accel and tmto are automatically
calculated.

The logic for accel and tmto is now separated and more flexible.
Whether the user relies on defaults, tuningdb entries, or manual
command line overrides, the code tries to make smart choices
based on what's actually available on the device.

First, it tries to find a kernel_accel value that fits into
available memory. It starts with a base value and simulates
tmto=1 or 2 (which is typically a good choice on GPUs).

It also leaves room for other buffers (pws[], tmps[], and so on).
If the result lands within roughly 10% above the device's
processor count, it is clamped to that count.
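
For the memory-constrained GPU case, the search and the clamp
look like this (excerpted from the new module code further down
in this diff):

    if ((size_per_accel * device_processors) > available_mem) // not enough memory
    {
      const float multi = (float) available_mem / size_per_accel;

      int accel_multi;

      // multiplying by 2 or 4 simulates tmto=1 or tmto=2
      for (accel_multi = 1; accel_multi <= 2; accel_multi++)
      {
        kernel_accel_new = multi * (1 << accel_multi);

        if (kernel_accel_new >= device_processors) break;
      }

      // we need some space for tmps[], ...
      kernel_accel_new -= (1 << accel_multi);

      // clamp if close to device processors -- 10% good?
      if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
      {
        kernel_accel_new = device_processors;
      }
    }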

This value is then added to the tuning database, so hashcat can pick
it up during startup.

Once that's set, it derives tmto using available memory, thread
count, and the actual SCRYPT parameters.
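
Condensed, that derivation is a linear search, again excerpted
from the module code below (size_per_accel here is
128 * r * N * kernel_threads):

    // pick the smallest tmto whose B[] footprint still fits
    if (tmto == 0) // 0 means the user did not override it
    {
      for (u32 tmto_new = 1; tmto_new <= 5; tmto_new++)
      {
        if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
        {
          tmto = tmto_new;

          break;
        }
      }
    }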

module_extra_buffer_size():

This function now just returns the size of the SCRYPT B[] buffer,
based on the tmto that was already calculated.
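
Concretely, the returned size reduces to a single expression
(names as used in the diff below):

    size_B = (128 * r * N * kernel_threads * kernel_accel) >> tmto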

kernel_threads:

Defaults are now set to 32 threads in most cases. On AMD GPUs,
64 threads might give a slight performance bump, but 32 is more
consistent and reliable.

For very memory-heavy algorithms (like Ethereum Wallet), it
scales down the thread count.

Here's a rough reference for other SCRYPT-based modes, keyed on
the per-candidate memory requirement (worked numbers follow the
list):

- 64 MiB: 16 threads
- 256 MiB: 4 threads
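
Those sizes follow from the per-candidate footprint of
128 * r * N bytes, using the per-mode constants visible in this
diff:

    scrypt    (N =  16384, r =  8): 128 *  8 *  16384 =  16 MiB -> 32 threads
    BestCrypt (N =  32768, r = 16): 128 * 16 *  32768 =  64 MiB -> 16 threads
    Ethereum  (N = 262144, r =  8): 128 *  8 * 262144 = 256 MiB ->  4 threads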

Tuning files:

All built-in tuningdb entries have been removed, since they
should no longer be necessary. You can still add custom entries
if needed; there's even a commented-out example in the tuningdb
file for mode 22700.
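
Generated and manual entries share the usual tuningdb format:
device name, attack mode, hash mode, vector width, kernel accel,
kernel loops. A hypothetical entry for mode 22700 (device name
and accel value are illustrative) would look like:

    GeForce_RTX_3080 * 22700 1 128 A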

Free memory handling:

Getting the actual amount of free GPU memory is critical for
this strategy to work correctly. Unfortunately, none of the
common GPGPU APIs report reliable numbers, so we now query
low-level interfaces: SYSFS on AMD and NVML on NVIDIA. Support
for both is already in place; ADL support still needs to be
added.

Because of this, hwmon support (which handles those low-level
queries) can no longer be disabled.
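
At startup this boils down to the following, condensed from the
backend.c hunk below (the retry loop that waits for other
processes to release memory is omitted):

    // low-level query: NVML on NVIDIA, sysfs on AMD; returns 0 if unsupported
    const u64 used_bytes = hm_get_memoryused_with_devices_idx (hashcat_ctx, device_id);

    if (used_bytes) device_param->device_available_mem -= used_bytes;
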
Jens Steube 2025-06-09 11:02:34 +02:00
parent c7d96b40e9
commit c87a87f992
28 changed files with 941 additions and 1296 deletions

View File

@@ -161,6 +161,18 @@ typedef enum nvmlGom_enum
* */
#define nvmlClocksThrottleReasonNone 0x0000000000000000LL
/**
* Memory allocation information for a device (v1).
* The total amount is equal to the sum of the amounts of free and used memory.
*/
typedef struct nvmlMemory_st
{
unsigned long long total; //!< Total physical device memory (in bytes)
unsigned long long free; //!< Unallocated device memory (in bytes)
unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes).
//!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
} nvmlMemory_t;
/*
* End of declarations from nvml.h
**/
@@ -191,6 +203,7 @@ typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREAS
typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_SET_COMPUTEMODE) (nvmlDevice_t, nvmlComputeMode_t);
typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_SET_OPERATIONMODE) (nvmlDevice_t, nvmlGpuOperationMode_t);
typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_PCIINFO) (nvmlDevice_t, nvmlPciInfo_t *);
typedef nvmlReturn_t (*NVML_API_CALL NVML_DEVICE_GET_MEMORYINFO) (nvmlDevice_t, nvmlMemory_t *);
typedef struct hm_nvml_lib
{
@@ -212,6 +225,7 @@ typedef struct hm_nvml_lib
NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS nvmlDeviceGetCurrentClocksThrottleReasons;
NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS nvmlDeviceGetSupportedClocksThrottleReasons;
NVML_DEVICE_GET_PCIINFO nvmlDeviceGetPciInfo;
NVML_DEVICE_GET_MEMORYINFO nvmlDeviceGetMemoryInfo;
} hm_nvml_lib_t;
@@ -232,5 +246,6 @@ int hm_NVML_nvmlDeviceGetClockInfo (void *hashcat_ctx, nvmlDevice_t device, nvml
int hm_NVML_nvmlDeviceGetTemperatureThreshold (void *hashcat_ctx, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
int hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (void *hashcat_ctx, nvmlDevice_t device, unsigned int *currLinkWidth);
int hm_NVML_nvmlDeviceGetPciInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlPciInfo_t *pci);
int hm_NVML_nvmlDeviceGetMemoryInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlMemory_t *mem);
#endif // HC_NVML_H

View File

@@ -34,5 +34,6 @@ int hm_SYSFS_AMDGPU_get_pp_dpm_sclk (void *hashcat_ctx, const int backend_device
int hm_SYSFS_AMDGPU_get_pp_dpm_mclk (void *hashcat_ctx, const int backend_device_idx, int *val);
int hm_SYSFS_AMDGPU_get_pp_dpm_pcie (void *hashcat_ctx, const int backend_device_idx, int *val);
int hm_SYSFS_AMDGPU_get_gpu_busy_percent (void *hashcat_ctx, const int backend_device_idx, int *val);
int hm_SYSFS_AMDGPU_get_mem_info_vram_used (void *hashcat_ctx, const int backend_device_idx, u64 *val);
#endif // HC_EXT_SYSFS_AMDGPU_H

View File

@@ -24,6 +24,7 @@ int hm_get_utilization_with_devices_idx (hashcat_ctx_t *hashcat_ctx, cons
int hm_get_memoryspeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
int hm_get_corespeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
int hm_get_throttle_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx);
int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx);
void hwmon_ctx_destroy (hashcat_ctx_t *hashcat_ctx);

View File

@@ -20,7 +20,7 @@ u32 module_dgst_pos2 (MAYBE_UNUSED const hashconfig_t *ha
u32 module_dgst_pos3 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u32 module_dgst_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes);
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel);
u32 module_forced_outfile_format (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u32 module_hash_category (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
const char *module_hash_name (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);

View File

@@ -17,7 +17,7 @@ int sort_by_tuning_db_entry (const void *v1, const void *v2);
int tuning_db_init (hashcat_ctx_t *hashcat_ctx);
void tuning_db_destroy (hashcat_ctx_t *hashcat_ctx);
bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num, const int source);
bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num);
tuning_db_entry_t *tuning_db_search (hashcat_ctx_t *hashcat_ctx, const char *device_name, const cl_device_type device_type, int attack_mode, const int hash_mode);
#endif // HC_TUNINGDB_H

View File

@@ -2067,6 +2067,7 @@ typedef struct hm_attrs
bool threshold_slowdown_get_supported;
bool throttle_get_supported;
bool utilization_get_supported;
bool memoryused_get_supported;
} hm_attrs_t;
@@ -3013,7 +3014,7 @@ typedef struct module_ctx
u32 (*module_dgst_size) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
bool (*module_dictstat_disable) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u64 (*module_esalt_size) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
const char *(*module_extra_tuningdb_block) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *, const backend_ctx_t *, const hashes_t *);
const char *(*module_extra_tuningdb_block) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *, const backend_ctx_t *, const hashes_t *, const u32, const u32);
u32 (*module_forced_outfile_format) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_hash_category) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
const char *(*module_hash_name) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);

View File

@@ -24,6 +24,7 @@
#include "dynloader.h"
#include "backend.h"
#include "terminal.h"
#include "hwmon.h"
#if defined (__linux__)
static const char *const dri_card0_path = "/dev/dri/card0";
@@ -9649,7 +9650,44 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (module_ctx->module_extra_tuningdb_block != MODULE_DEFAULT)
{
const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes);
// We need this because we can't trust CUDA/HIP to give us the real free device memory
// The only way to do so is through low level APIs
for (int i = 0; i < 10; i++)
{
const u64 used_bytes = hm_get_memoryused_with_devices_idx (hashcat_ctx, device_id);
if (used_bytes)
{
if ((used_bytes > (2ULL * 1024 * 1024 * 1024))
|| (used_bytes > (device_param->device_global_mem * 0.5)))
{
event_log_warning (hashcat_ctx, "* Device #%u: Memory usage is too high: %" PRIu64 "/%" PRIu64 ", waiting...", device_id + 1, used_bytes, device_param->device_global_mem);
sleep (1);
continue;
}
device_param->device_available_mem -= used_bytes;
break;
}
else
{
break;
}
}
u32 _kernel_accel = 0;
tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
if (user_options->kernel_accel_chgd == true) _kernel_accel = user_options->kernel_accel;
const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes, device_id, _kernel_accel);
char *lines_buf = hcstrdup (extra_tuningdb_block);
@@ -9669,7 +9707,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (next[0] == '#') continue;
tuning_db_process_line (hashcat_ctx, next, line_num, 2);
tuning_db_process_line (hashcat_ctx, next, line_num);
} while ((next = strtok_r ((char *) NULL, "\n", &saveptr)) != NULL);

View File

@@ -149,6 +149,7 @@ int nvml_init (void *hashcat_ctx)
HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrentClocksThrottleReasons, NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS, NVML, 0);
HC_LOAD_FUNC(nvml, nvmlDeviceGetSupportedClocksThrottleReasons, NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS, NVML, 0);
HC_LOAD_FUNC(nvml, nvmlDeviceGetPciInfo, NVML_DEVICE_GET_PCIINFO, NVML, 0);
HC_LOAD_FUNC(nvml, nvmlDeviceGetMemoryInfo, NVML_DEVICE_GET_MEMORYINFO, NVML, 0);
return 0;
}
@@ -392,3 +393,24 @@ int hm_NVML_nvmlDeviceGetPciInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlPc
return 0;
}
int hm_NVML_nvmlDeviceGetMemoryInfo (void *hashcat_ctx, nvmlDevice_t device, nvmlMemory_t *mem)
{
hwmon_ctx_t *hwmon_ctx = ((hashcat_ctx_t *) hashcat_ctx)->hwmon_ctx;
NVML_PTR *nvml = (NVML_PTR *) hwmon_ctx->hm_nvml;
const nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetMemoryInfo (device, mem);
if (nvml_rc != NVML_SUCCESS)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
event_log_error (hashcat_ctx, "nvmlDeviceGetMemoryInfo(): %s", string);
return -1;
}
return 0;
}

View File

@@ -441,3 +441,55 @@ int hm_SYSFS_AMDGPU_get_gpu_busy_percent (void *hashcat_ctx, const int backend_d
return 0;
}
int hm_SYSFS_AMDGPU_get_mem_info_vram_used (void *hashcat_ctx, const int backend_device_idx, u64 *val)
{
char *syspath = hm_SYSFS_AMDGPU_get_syspath_device (hashcat_ctx, backend_device_idx);
if (syspath == NULL) return -1;
char *path;
hc_asprintf (&path, "%s/mem_info_vram_used", syspath);
hcfree (syspath);
HCFILE fp;
if (hc_fopen (&fp, path, "r") == false)
{
event_log_error (hashcat_ctx, "%s: %s", path, strerror (errno));
hcfree (path);
return -1;
}
u64 mem_info_vram_used = 0;
while (!hc_feof (&fp))
{
char buf[HCBUFSIZ_TINY];
char *ptr = hc_fgets (buf, sizeof (buf), &fp);
if (ptr == NULL) continue;
size_t len = strlen (ptr);
if (len < 1) continue;
int rc = sscanf (ptr, "%" PRIu64, &mem_info_vram_used);
if (rc == 1) break;
}
hc_fclose (&fp);
*val = mem_info_vram_used;
hcfree (path);
return 0;
}

View File

@@ -1214,6 +1214,60 @@ int hm_get_throttle_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int back
return -1;
}
u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int backend_device_idx)
{
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
if (hwmon_ctx->enabled == false) return 0;
if (hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported == false) return 0;
if ((backend_ctx->devices_param[backend_device_idx].is_opencl == true) || (backend_ctx->devices_param[backend_device_idx].is_hip == true) || (backend_ctx->devices_param[backend_device_idx].is_cuda == true))
{
if (backend_ctx->devices_param[backend_device_idx].opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD) || (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP))
{
if (hwmon_ctx->hm_sysfs_amdgpu)
{
u64 used = 0;
if (hm_SYSFS_AMDGPU_get_mem_info_vram_used (hashcat_ctx, backend_device_idx, &used) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported = false;
return 0;
}
return used;
}
}
if (backend_ctx->devices_param[backend_device_idx].opencl_device_vendor_id == VENDOR_ID_NV)
{
if (hwmon_ctx->hm_nvml)
{
nvmlMemory_t mem;
if (hm_NVML_nvmlDeviceGetMemoryInfo (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].nvml, &mem) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported = false;
return 0;
}
return mem.used;
}
}
}
}
hwmon_ctx->hm_device[backend_device_idx].memoryused_get_supported = false;
return 0;
}
int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
{
bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx;
@@ -1227,12 +1281,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
if (bridge_ctx->enabled == true) backend_devices_cnt = 1;
#if !defined (WITH_HWMON)
return 0;
#endif // WITH_HWMON
//#if !defined (WITH_HWMON)
//return 0;
//#endif // WITH_HWMON
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
//if (user_options->backend_info > 0) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
@@ -1241,7 +1295,9 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->stdout_flag == true) return 0;
if (user_options->version == true) return 0;
if (user_options->identify == true) return 0;
if (user_options->hwmon == false) return 0;
//we need hwmon support to get free memory per device support
//its a joke, but there's no way around
//if (user_options->hwmon == false) return 0;
hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
@@ -1387,6 +1443,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hm_adapters_nvml[device_id].threshold_shutdown_get_supported = true;
hm_adapters_nvml[device_id].threshold_slowdown_get_supported = true;
hm_adapters_nvml[device_id].utilization_get_supported = true;
hm_adapters_nvml[device_id].memoryused_get_supported = true;
}
}
}
@@ -1419,6 +1476,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hm_adapters_nvml[device_id].threshold_shutdown_get_supported = true;
hm_adapters_nvml[device_id].threshold_slowdown_get_supported = true;
hm_adapters_nvml[device_id].utilization_get_supported = true;
hm_adapters_nvml[device_id].memoryused_get_supported = true;
}
}
}
@@ -1640,6 +1698,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported = true;
}
}
}
@@ -1746,6 +1805,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported |= hm_adapters_nvml[device_id].throttle_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported |= hm_adapters_nvml[device_id].utilization_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported |= hm_adapters_nvml[device_id].memoryused_get_supported;
}
if (hwmon_ctx->hm_nvapi)
@@ -1875,6 +1935,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported |= hm_adapters_sysfs_amdgpu[device_id].threshold_slowdown_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported |= hm_adapters_sysfs_amdgpu[device_id].throttle_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported |= hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported |= hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported;
}
}
@@ -1895,6 +1956,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hwmon_ctx->hm_device[backend_devices_idx].threshold_slowdown_get_supported |= hm_adapters_nvml[device_id].threshold_slowdown_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].throttle_get_supported |= hm_adapters_nvml[device_id].throttle_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].utilization_get_supported |= hm_adapters_nvml[device_id].utilization_get_supported;
hwmon_ctx->hm_device[backend_devices_idx].memoryused_get_supported |= hm_adapters_nvml[device_id].memoryused_get_supported;
}
if (hwmon_ctx->hm_nvapi)
@@ -1927,6 +1989,7 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hm_get_threshold_slowdown_with_devices_idx (hashcat_ctx, backend_devices_idx);
hm_get_throttle_with_devices_idx (hashcat_ctx, backend_devices_idx);
hm_get_utilization_with_devices_idx (hashcat_ctx, backend_devices_idx);
hm_get_memoryused_with_devices_idx (hashcat_ctx, backend_devices_idx);
}
FREE_ADAPTERS;

View File

@@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig,
static const char *SIGNATURE_SCRYPT = "SCRYPT";
static const u32 SCRYPT_THREADS = 32;
static const u64 SCRYPT_N = 16384;
static const u64 SCRYPT_R = 8;
static const u64 SCRYPT_P = 1;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
return kernel_loops_max;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 32;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@@ -84,26 +93,110 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case user has overridden
// it's important to set to 0 otherwise so we can postprocess tmto in that case
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// from command line or tuning db has priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// we need some space for tmps[], ...
kernel_accel_new -= (1 << accel_multi);
// clamp if close to device processors -- 10% good?
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// fix tmto if user allows
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@@ -112,61 +205,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
@@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u64 tmto_start = 0;
u64 tmto_stop = 4;
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
if (user_options->scrypt_tmto_chgd == true)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks
+ size_pws_pre
+ size_pws_base
+ EXTRA_SPACE;
bool not_enough_memory = true;
u64 size_scrypt = 0;
u64 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -527,7 +464,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

View File

@@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig,
static const char *SIGNATURE_CISCO9 = "$9$";
static const u32 SCRYPT_THREADS = 32;
static const u64 SCRYPT_N = 16384;
static const u64 SCRYPT_R = 1;
static const u64 SCRYPT_P = 1;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
return kernel_loops_max;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 32;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@@ -84,26 +93,110 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case user has overridden
// it's important to set to 0 otherwise so we can postprocess tmto in that case
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// from command line or tuning db has priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// we need some space for tmps[], ...
kernel_accel_new -= (1 << accel_multi);
// clamp if close to device processors -- 10% good?
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// fix tmto if user allows
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@@ -112,61 +205,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
@@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u64 tmto_start = 0;
u64 tmto_stop = 4;
u64 size_scrypt = req1 * device_param->kernel_accel_max;
if (user_options->scrypt_tmto_chgd == true)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks
+ size_pws_pre
+ size_pws_base
+ EXTRA_SPACE;
bool not_enough_memory = true;
u64 size_scrypt = 0;
u64 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -488,7 +425,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

View File

@@ -56,6 +56,8 @@ typedef struct ethereum_scrypt
static const char *SIGNATURE_ETHEREUM_SCRYPT = "$ethereum$s";
static const u32 SCRYPT_THREADS = 4;
static const u64 SCRYPT_N = 262144;
static const u64 SCRYPT_R = 8;
static const u64 SCRYPT_P = 1;
@@ -74,9 +76,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
return kernel_loops_max;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 4;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@@ -98,26 +107,110 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case user has overridden
// it's important to set to 0 otherwise so we can postprocess tmto in that case
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// from command line or tuning db has priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// we need some space for tmps[], ...
kernel_accel_new -= (1 << accel_multi);
// clamp if close to device processors -- 10% good?
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// fix tmto if user allows
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@@ -126,61 +219,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
@@ -193,115 +234,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u64 tmto_start = 0;
u64 tmto_stop = 4;
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
if (user_options->scrypt_tmto_chgd == true)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks
+ size_pws_pre
+ size_pws_base
+ EXTRA_SPACE;
bool not_enough_memory = true;
u64 size_scrypt = 0;
u64 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -587,7 +524,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

View File

@@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig,
static const char *SIGNATURE_MULTIBIT = "$multibit$";
static const u32 SCRYPT_THREADS = 32;
static const u64 SCRYPT_N = 16384;
static const u64 SCRYPT_R = 8;
static const u64 SCRYPT_P = 1;
@@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
return kernel_loops_max;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 32;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@@ -84,26 +93,110 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case user has overridden
// it's important to set to 0 otherwise so we can postprocess tmto in that case
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// from command line or tuning db has priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// we need some space for tmps[], ...
kernel_accel_new -= (1 << accel_multi);
// clamp if close to device processors -- 10% good?
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// fix tmto if user allows
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@@ -112,61 +205,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
@@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u64 tmto_start = 0;
u64 tmto_stop = 4;
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
if (user_options->scrypt_tmto_chgd == true)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks
+ size_pws_pre
+ size_pws_base
+ EXTRA_SPACE;
bool not_enough_memory = true;
u64 size_scrypt = 0;
u64 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
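
For scale, with the same assumed numbers as the sketch above (N=16384, r=8, 32 threads, kernel_accel_max=124, tmto=2), this function returns 512 MiB * 124 / 4 = 15.5 GiB for the B[] buffer; every additional tmto step halves that figure at the cost of recomputing the skipped V[] entries inside the kernel.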
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@ -526,7 +463,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

View File

@ -57,27 +57,13 @@ typedef struct bestcrypt_scrypt
// 16 is on the low side; we may need to change this depending on user feedback
static const char *SIGNATURE_BESTCRYPT_SCRYPT = "$bcve$";
static const u32 SCRYPT_MAX_ACCEL = 256;
static const u32 SCRYPT_MAX_THREADS = 4;
static const u32 SCRYPT_THREADS = 16;
static const u64 SCRYPT_N = 32768;
static const u64 SCRYPT_R = 16;
static const u64 SCRYPT_P = 1;
u32 module_kernel_accel_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_accel_min = 1;
return kernel_accel_min;
}
u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_accel_max = (user_options->kernel_accel_chgd == true) ? user_options->kernel_accel : SCRYPT_MAX_ACCEL;
return kernel_accel_max;
}
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 1;
@ -94,14 +80,14 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = 1;
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_MAX_THREADS;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@ -123,26 +109,110 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case the user has overridden it
// otherwise it must be set to 0, so we can derive it automatically later on
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// a value from the command line or the tuning db takes priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// back off one step to leave some space for tmps[], etc.
kernel_accel_new -= (1 << accel_multi);
// clamp to the device processor count if we land within ~10% of it
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// derive tmto automatically unless the user has fixed it
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@ -151,61 +221,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
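
To see why this module pins SCRYPT_THREADS at 16: with BestCrypt's parameters (N=32768, r=16), one thread lane needs 128 * 16 * 32768 = 64 MiB of B[], so a single accel unit at 16 threads already costs 1 GiB before any tmto is applied, and the tmto loop above usually has to halve that footprint several times before it fits.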
@ -215,121 +233,14 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
// we need to set the self-test hash settings to pass the self-test
// the decoder for the self-test is called after this function
const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u32 tmto_start = 1;
u32 tmto_stop = 6;
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
if (user_options->scrypt_tmto)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
/*
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
*/
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
/*
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
*/
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks;
// + size_pws_pre
// + size_pws_base;
/*
+ EXTRA_SPACE;
*/
bool not_enough_memory = true;
u64 size_scrypt = 0;
u32 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
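
A rough feel for scrypt_extra_space, using hypothetical values (128 processors, 16 threads, kernel_accel_max = 32, and ignoring OPTS_TYPE_MP_MULTI_DISABLE): kernel_power_max = 128 * 16 * 32 = 65536, so size_pws_comp alone is 65536 * 256 bytes = 16 MiB, and the pws, tmps and hooks buffers scale linearly with the same factor, which is why the tuning pass leaves headroom before sizing B[].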
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@ -593,8 +504,8 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_hook_size = MODULE_DEFAULT;
module_ctx->module_jit_build_options = module_jit_build_options;
module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
module_ctx->module_kernel_accel_max = module_kernel_accel_max;
module_ctx->module_kernel_accel_min = module_kernel_accel_min;
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;

View File

@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig,
static const char *SIGNATURE_MULTIBIT = "$multibit$";
static const u32 SCRYPT_THREADS = 32;
static const u64 SCRYPT_N = 16384;
static const u64 SCRYPT_R = 8;
static const u64 SCRYPT_P = 1;
@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
return kernel_loops_max;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 32;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@ -84,26 +93,110 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case the user has overridden it
// otherwise it must be set to 0, so we can derive it automatically later on
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// a value from the command line or the tuning db takes priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// back off one step to leave some space for tmps[], etc.
kernel_accel_new -= (1 << accel_multi);
// clamp to the device processor count if we land within ~10% of it
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// derive tmto automatically unless the user has fixed it
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@ -112,61 +205,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
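
Worked example with MultiBit's parameters (N=16384, r=8, 32 threads): size_per_accel = 128 * 8 * 16384 * 32 = 512 MiB. On a hypothetical card with 7 GiB usable and kernel_accel_new = 24, the loop above first tests 24 * 256 MiB = 6 GiB at tmto=1, which fits, so tmto settles at 1; at tmto=0 the same workload would have needed 12 GiB.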
@ -179,115 +220,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u64 tmto_start = 0;
u64 tmto_stop = 4;
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
if (user_options->scrypt_tmto_chgd == true)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks
+ size_pws_pre
+ size_pws_base
+ EXTRA_SPACE;
bool not_enough_memory = true;
u64 size_scrypt = 0;
u64 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@ -550,7 +487,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

View File

@ -57,6 +57,8 @@ typedef struct exodus
static const char *SIGNATURE_EXODUS = "EXODUS";
static const u32 SCRYPT_THREADS = 32;
static const u64 SCRYPT_N = 16384;
static const u64 SCRYPT_R = 8;
static const u64 SCRYPT_P = 1;
@ -75,9 +77,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
return kernel_loops_max;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 32;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@ -96,26 +105,110 @@ u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED
return esalt_size;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case the user has overridden it
// otherwise it must be set to 0, so we can derive it automatically later on
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// a value from the command line or the tuning db takes priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// back off one step to leave some space for tmps[], etc.
kernel_accel_new -= (1 << accel_multi);
// clamp to the device processor count if we land within ~10% of it
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// derive tmto automatically unless the user has fixed it
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@ -124,61 +217,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
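
Note the clamp at the top of this function: the planning budget is MIN (device_available_mem, device_maxmem_alloc * 4), so a device reporting 24 GiB free but a 4 GiB single-allocation limit is planned as if it had 16 GiB, presumably because the B[] buffer can be split across at most four allocations (cf. the size_scrypt / 4 check in the old sizing loop below).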
@ -191,115 +232,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u64 tmto_start = 0;
u64 tmto_stop = 4;
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
if (user_options->scrypt_tmto_chgd == true)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks
+ size_pws_pre
+ size_pws_base
+ EXTRA_SPACE;
bool not_enough_memory = true;
u64 size_scrypt = 0;
u64 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@ -634,7 +571,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

View File

@ -49,6 +49,8 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig,
static const char *SIGNATURE_BISQ = "$bisq$";
static const u32 SCRYPT_THREADS = 16;
static const u64 SCRYPT_N = 32768;
static const u64 SCRYPT_R = 8;
static const u64 SCRYPT_P = 6;
@ -67,9 +69,16 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
return kernel_loops_max;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_min = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 32;
const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : SCRYPT_THREADS;
return kernel_threads_max;
}
@ -91,26 +100,110 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes)
u32 tmto = 0;
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
{
// preprocess tmto in case the user has overridden it
// otherwise it must be set to 0, so we can derive it automatically later on
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so this should be fine
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 req1 = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
const u64 size_per_accel = (128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra)) >> tmto;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 device_processors = device_param->device_processors;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
u32 kernel_accel_new = device_processors;
if (kernel_accel)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// a value from the command line or the tuning db takes priority
if (device_param->skipped == true) continue;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel programmatically
const u64 avail = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (2 * req1);
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
{
const float multi = (float) available_mem / size_per_accel;
int accel_multi;
for (accel_multi = 1; accel_multi <= 2; accel_multi++)
{
kernel_accel_new = multi * (1 << accel_multi);
if (kernel_accel_new >= device_processors) break;
}
// back off one step to leave some space for tmps[], etc.
kernel_accel_new -= (1 << accel_multi);
// clamp to the device processor count if we land within ~10% of it
if ((kernel_accel_new > device_processors) && ((kernel_accel_new - device_processors) <= (device_processors / 10)))
{
kernel_accel_new = device_processors;
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((size_per_accel * device_processors * i) < available_mem)
{
kernel_accel_new = device_processors * i;
}
}
}
}
// derive tmto automatically unless the user has fixed it
if (tmto == 0)
{
const u32 tmto_start = 1;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
{
if (available_mem > (kernel_accel_new * (size_per_accel >> tmto_new)))
{
tmto = tmto_new;
break;
}
}
}
char *new_device_name = hcstrdup (device_param->device_name);
@ -119,61 +212,9 @@ const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashco
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
char *out_name = new_device_name;
if (memcmp (new_device_name, "AMD_", 4) == 0) out_name += 4;
if (memcmp (new_device_name, "NVIDIA_", 7) == 0) out_name += 7;
// ok, try to find a nice accel programmatically
u32 accel = device_param->device_processors;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
// expect to change any of this
if (avail < (req1 * accel)) // not enough memory
{
const float multi = (float) avail / req1;
accel = multi;
for (int i = 1; i <= 4; i++) // this is tmto
{
if (device_param->device_processors > accel)
{
accel = ((u64) multi << i) & ~3;
}
}
}
else
{
for (int i = 1; i <= 8; i++)
{
if ((avail * 2) > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
}
else
{
const u64 req1 = 128 * scrypt_r * scrypt_N;
for (int i = 1; i <= 8; i++)
{
if (avail > (req1 * accel))
{
accel = device_param->device_processors * i;
}
}
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", out_name, user_options->hash_mode, accel);
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
}
return lines_buf;
}
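
With Bisq's parameters (N=32768, r=8) one thread lane needs 128 * 8 * 32768 = 32 MiB, i.e. 512 MiB per accel unit at the 16-thread default. Note that scrypt_p (6 here) never enters the sizing above: only N and r determine the per-lane B[] footprint, since the p iterations can reuse the same scratch memory.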
@ -186,115 +227,11 @@ u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
const u64 scrypt_N = (hashes->salts_buf[0].scrypt_N) ? hashes->salts_buf[0].scrypt_N : SCRYPT_N;
const u64 scrypt_r = (hashes->salts_buf[0].scrypt_r) ? hashes->salts_buf[0].scrypt_r : SCRYPT_R;
const u64 kernel_power_max = ((OPTS_TYPE & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 size_per_accel = 128 * scrypt_r * scrypt_N * module_kernel_threads_max (hashconfig, user_options, user_options_extra);
u64 tmto_start = 0;
u64 tmto_stop = 4;
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
if (user_options->scrypt_tmto_chgd == true)
{
tmto_start = user_options->scrypt_tmto;
tmto_stop = user_options->scrypt_tmto;
}
// size_pws
const u64 size_pws = kernel_power_max * sizeof (pw_t);
const u64 size_pws_amp = size_pws;
// size_pws_comp
const u64 size_pws_comp = kernel_power_max * (sizeof (u32) * 64);
// size_pws_idx
const u64 size_pws_idx = (kernel_power_max + 1) * sizeof (pw_idx_t);
// size_tmps
const u64 size_tmps = kernel_power_max * hashconfig->tmp_size;
// size_hooks
const u64 size_hooks = kernel_power_max * hashconfig->hook_size;
u64 size_pws_pre = 4;
u64 size_pws_base = 4;
if (user_options->slow_candidates == true)
{
// size_pws_pre
size_pws_pre = kernel_power_max * sizeof (pw_pre_t);
// size_pws_base
size_pws_base = kernel_power_max * sizeof (pw_pre_t);
}
// sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
// let's add some extra space just to be sure.
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
u64 EXTRA_SPACE = (1024ULL * 1024ULL) * device_param->kernel_accel_max;
EXTRA_SPACE = MAX (EXTRA_SPACE, ( 64ULL * 1024ULL * 1024ULL));
EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
const u64 scrypt_extra_space
= device_param->size_bfs
+ device_param->size_combs
+ device_param->size_digests
+ device_param->size_esalts
+ device_param->size_markov_css
+ device_param->size_plains
+ device_param->size_results
+ device_param->size_root_css
+ device_param->size_rules
+ device_param->size_rules_c
+ device_param->size_salts
+ device_param->size_shown
+ device_param->size_tm
+ device_param->size_st_digests
+ device_param->size_st_salts
+ device_param->size_st_esalts
+ size_pws
+ size_pws_amp
+ size_pws_comp
+ size_pws_idx
+ size_tmps
+ size_hooks
+ size_pws_pre
+ size_pws_base
+ EXTRA_SPACE;
bool not_enough_memory = true;
u64 size_scrypt = 0;
u64 tmto;
for (tmto = tmto_start; tmto <= tmto_stop; tmto++)
{
size_scrypt = (128ULL * scrypt_r) * scrypt_N;
size_scrypt /= 1ull << tmto;
size_scrypt *= kernel_power_max;
if ((size_scrypt / 4) > device_param->device_maxmem_alloc) continue;
if ((size_scrypt + scrypt_extra_space) > device_param->device_available_mem) continue;
not_enough_memory = false;
break;
}
if (not_enough_memory == true) return -1;
return size_scrypt;
return size_scrypt / (1 << tmto);
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@ -557,7 +494,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

View File

@ -43,11 +43,6 @@ int sort_by_tuning_db_entry (const void *v1, const void *v2)
if (res3 != 0) return (res3);
const int res4 = t1->source
- t2->source;
if (res4 != 0) return (res4);
return 0;
}
@ -118,7 +113,7 @@ int tuning_db_init (hashcat_ctx_t *hashcat_ctx)
if (line_buf[0] == '#') continue;
tuning_db_process_line (hashcat_ctx, line_buf, line_num, 1);
tuning_db_process_line (hashcat_ctx, line_buf, line_num);
}
hcfree (buf);
@ -167,7 +162,7 @@ void tuning_db_destroy (hashcat_ctx_t *hashcat_ctx)
memset (tuning_db, 0, sizeof (tuning_db_t));
}
bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num, const int source)
bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num)
{
tuning_db_t *tuning_db = hashcat_ctx->tuning_db;
user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
@ -353,7 +348,6 @@ bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, c
entry->vector_width = vector_width;
entry->kernel_accel = kernel_accel;
entry->kernel_loops = kernel_loops;
entry->source = source;
tuning_db->entry_cnt++;
}
@ -430,12 +424,11 @@ static tuning_db_entry_t *tuning_db_search_real (hashcat_ctx_t *hashcat_ctx, con
// this will produce all 2^3 combinations required
for (i = 0; i < 16; i++)
for (i = 0; i < 8; i++)
{
s.source = (i & 1) ? 2 : 1;
s.device_name = (i & 1) ? "*" : device_name_nospace;
s.attack_mode = (i & 2) ? -1 : attack_mode;
s.hash_mode = (i & 4) ? -1 : hash_mode;
s.device_name = (i & 8) ? "*" : device_name_nospace;
entry = (tuning_db_entry_t *) bsearch (&s, tuning_db->entry_buf, tuning_db->entry_cnt, sizeof (tuning_db_entry_t), sort_by_tuning_db_entry);
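
With the source field gone, the three remaining wildcard bits give the 2^3 = 8 probes mentioned above, tried from most specific to most generic. A tiny standalone sketch (the device name is hypothetical) that prints the probe order:

#include <stdio.h>

int main (void)
{
  const char *device_name = "GeForce_RTX_4090"; // hypothetical
  const int   attack_mode = 0;
  const int   hash_mode   = 22700;

  for (int i = 0; i < 8; i++)
  {
    const char *dn = (i & 1) ? "*" : device_name; // wildcard device name
    const int   am = (i & 2) ? -1  : attack_mode; // wildcard attack mode
    const int   hm = (i & 4) ? -1  : hash_mode;   // wildcard hash mode

    printf ("probe %d: device=%-18s attack_mode=%2d hash_mode=%6d\n", i, dn, am, hm);
  }

  return 0;
}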
@ -443,7 +436,7 @@ static tuning_db_entry_t *tuning_db_search_real (hashcat_ctx_t *hashcat_ctx, con
// in non-wildcard mode do some additional checks:
if ((i & 8) == 0)
if ((i & 1) == 0)
{
// in case we have an alias-name

View File

@ -1895,6 +1895,14 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
}
#endif
if (user_options->hwmon == false)
{
// some algorithms, such as SCRYPT, depend on accurate free memory values
// the only way to get them is through low-level APIs such as NVML, via hwmon
user_options->hwmon = true;
}
if (user_options->stdout_flag)
{
user_options->hwmon = false;

View File

@ -24,4 +24,3 @@
# It's better to derive the tuning based on the hash information (handled by the hash-mode plugin).
# The tunings from the hash-mode plugin may be slightly off, so if you have better values, you can hardcode them here.

View File

@ -19,7 +19,3 @@
#Device Attack Hash Vector Kernel Kernel
#Name Mode Type Width Accel Loops
GeForce_RTX_4090 * 9300 1 512 A
ALIAS_AMD_RX6900XT * 9300 1 720 A
ALIAS_AMD_RX7900XTX * 9300 1 840 A

View File

@ -19,7 +19,3 @@
#Device Attack Hash Vector Kernel Kernel
#Name Mode Type Width Accel Loops
GeForce_RTX_4090 * 15700 1 180 A
ALIAS_AMD_RX6900XT * 15700 1 56 A
ALIAS_AMD_RX7900XTX * 15700 1 92 A

View File

@ -19,7 +19,14 @@
#Device Attack Hash Vector Kernel Kernel
#Name Mode Type Width Accel Loops
GeForce_RTX_4090 * 22700 1 180 A
ALIAS_AMD_RX6900XT * 22700 1 56 A
ALIAS_AMD_RX7900XTX * 22700 1 92 A
#Leaving this here as a reference
#GeForce_GTX_980 * 22700 1 28 A
#GeForce_GTX_1630 * 22700 1 11 A
#GeForce_RTX_2080_Ti * 22700 1 78 A
#GeForce_RTX_3090 * 22700 1 82 A
#GeForce_RTX_4090 * 22700 1 180 A
#ALIAS_AMD_RX480 * 22700 1 28 A
#ALIAS_AMD_Vega64 * 22700 1 28 A
#ALIAS_AMD_RX6900XT * 22700 1 56 A
#ALIAS_AMD_RX7900XTX * 22700 1 92 A

View File

@ -19,7 +19,3 @@
#Device Attack Hash Vector Kernel Kernel
#Name Mode Type Width Accel Loops
GeForce_RTX_4090 * 24000 1 180 A
ALIAS_AMD_RX6900XT * 24000 1 56 A
ALIAS_AMD_RX7900XTX * 24000 1 92 A

View File

@ -19,7 +19,3 @@
#Device Attack Hash Vector Kernel Kernel
#Name Mode Type Width Accel Loops
GeForce_RTX_4090 * 27700 1 180 A
ALIAS_AMD_RX6900XT * 27700 1 56 A
ALIAS_AMD_RX7900XTX * 27700 1 92 A

View File

@ -19,7 +19,3 @@
#Device Attack Hash Vector Kernel Kernel
#Name Mode Type Width Accel Loops
GeForce_RTX_4090 * 28200 1 180 A
ALIAS_AMD_RX6900XT * 28200 1 56 A
ALIAS_AMD_RX7900XTX * 28200 1 92 A

View File

@ -18,8 +18,3 @@
#Device Attack Hash Vector Kernel Kernel
#Name Mode Type Width Accel Loops
GeForce_RTX_4090 * 29800 1 180 A
ALIAS_AMD_RX6900XT * 29800 1 56 A
ALIAS_AMD_RX7900XTX * 29800 1 92 A