diff --git a/src/autotune.c b/src/autotune.c
index cdeb73553..d6c406513 100644
--- a/src/autotune.c
+++ b/src/autotune.c
@@ -10,7 +10,7 @@
 #include "status.h"
 #include "autotune.h"
 
-static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
+static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads)
 {
   hashconfig_t   *hashconfig   = hashcat_ctx->hashconfig;
   user_options_t *user_options = hashcat_ctx->user_options;
@@ -19,7 +19,9 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
   device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set
   device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
 
-  u32 kernel_power_try = device_param->hardware_power * kernel_accel;
+  const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads;
+
+  u32 kernel_power_try = hardware_power * kernel_accel;
 
   if (user_options->attack_mode == ATTACK_MODE_ASSOCIATION)
   {
@@ -33,6 +35,10 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
     }
   }
 
+  const u32 kernel_threads_sav = device_param->kernel_threads;
+
+  device_param->kernel_threads = kernel_threads;
+
   const double spin_damp_sav = device_param->spin_damp;
 
   device_param->spin_damp = 0;
@@ -50,71 +56,51 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
   }
   else
   {
-    run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_try, true, 0);
-
-    if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
-    {
-      run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_try, true, 0);
-    }
-
     run_kernel (hashcat_ctx, device_param, KERN_RUN_2, 0, kernel_power_try, true, 0);
   }
 
   device_param->spin_damp = spin_damp_sav;
 
+  device_param->kernel_threads = kernel_threads_sav;
+
   const double exec_msec_prev = get_avg_exec_time (device_param, 1);
 
   return exec_msec_prev;
 }
 
-/*
-static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
+static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads, const int times)
 {
-  hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
-
-  device_param->kernel_params_buf32[28] = 0;
-  device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set
-  device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
+  double exec_msec_best = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
 
-  const u32 kernel_power_try = device_param->hardware_power * kernel_accel;
-
-  const u32 kernel_threads_sav = device_param->kernel_threads;
-
-  const double spin_damp_sav = device_param->spin_damp;
-
-  device_param->spin_damp = 0;
-
-  if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+  for (int i = 1; i < times; i++)
   {
-    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-    {
-      device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple1;
+    double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
 
-      run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_try, true, 0);
-    }
-    else
-    {
-      device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple4;
+    if (exec_msec > exec_msec_best) continue;
 
-      run_kernel (hashcat_ctx, device_param, KERN_RUN_4, 0, kernel_power_try, true, 0);
-    }
+    exec_msec_best = exec_msec;
   }
-  else
-  {
-    device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple2;
 
-    run_kernel (hashcat_ctx, device_param, KERN_RUN_2, 0, kernel_power_try, true, 0);
-  }
+  return exec_msec_best;
+}
 
-  device_param->kernel_threads = kernel_threads_sav;
+static u32 previous_power_of_two (const u32 x)
+{
+  // https://stackoverflow.com/questions/2679815/previous-power-of-2
+  // really cool! smear the highest set bit into all lower bits, then keep only that top bit
 
-  device_param->spin_damp = spin_damp_sav;
+  if (x == 0) return 0;
 
-  const double exec_msec_prev = get_avg_exec_time (device_param, 1);
+  u32 r = x;
 
-  return exec_msec_prev;
+  r |= (r >>  1);
+  r |= (r >>  2);
+  r |= (r >>  4);
+  r |= (r >>  8);
+  r |= (r >> 16);
+
+  return r - (r >> 1);
 }
-*/
 
 static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 {
@@ -131,9 +117,57 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   const u32 kernel_loops_min = device_param->kernel_loops_min;
   const u32 kernel_loops_max = device_param->kernel_loops_max;
 
+  const u32 kernel_threads_min = device_param->kernel_threads_min;
+  const u32 kernel_threads_max = device_param->kernel_threads_max;
+
   u32 kernel_accel = kernel_accel_min;
   u32 kernel_loops = kernel_loops_min;
 
+  // for the threads we take as the initial value what we receive from the runtime,
+  // but this is only a starting point; we will fine-tune this value as soon as we have our workload specified
+  // this thread limiting is also performed inside run_kernel() so we need to redo it here, too
+
+  u32 kernel_wgs = 0;
+  u32 kernel_wgs_multiple = 0;
+
+  if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+  {
+    if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+    {
+      kernel_wgs = device_param->kernel_wgs1;
+
+      kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple1;
+    }
+    else
+    {
+      kernel_wgs = device_param->kernel_wgs4;
+
+      kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple4;
+    }
+  }
+  else
+  {
+    kernel_wgs = device_param->kernel_wgs2;
+
+    kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple2;
+  }
+
+  u32 kernel_threads = kernel_threads_max;
+
+  if ((kernel_wgs >= kernel_threads_min) && (kernel_wgs <= kernel_threads_max))
+  {
+    kernel_threads = kernel_wgs;
+  }
+
+  // having a power-of-2 value makes it easier to divide
+
+  const u32 kernel_threads_p2 = previous_power_of_two (kernel_threads);
+
+  if ((kernel_threads_p2 >= kernel_threads_min) && (kernel_threads_p2 <= kernel_threads_max))
+  {
+    kernel_threads = kernel_threads_p2;
+  }
+
   // in this case the user specified a fixed -n and -u on the commandline
   // no way to tune anything
   // but we need to run a few caching rounds
@@ -149,10 +183,10 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
     if (hashconfig->warmup_disable == false)
     {
-      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
-      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
-      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
-      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
+      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
+      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
+      try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
     }
 
     #endif
@@ -222,13 +256,37 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       }
     }
 
+    // we also need to initialize some values using kernels
+
+    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      // nothing to do
+    }
+    else
+    {
+      const u32 kernel_threads_sav = device_param->kernel_threads;
+
+      device_param->kernel_threads = device_param->kernel_wgs1;
+
+      run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_max, false, 0);
+
+      if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
+      {
+        device_param->kernel_threads = device_param->kernel_wgs2p;
+
+        run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_max, false, 0);
+      }
+
+      device_param->kernel_threads = kernel_threads_sav;
+    }
+
     // Do a pre-autotune test run to find out if kernel runtime is above some TDR limit
 
     u32 kernel_loops_max_reduced = kernel_loops_max;
 
     if (true)
     {
-      double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min);
+      double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads);
 
       if (exec_msec > 2000)
       {
@@ -237,7 +295,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
         return -1;
       }
 
-      exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min);
+      exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads);
 
       const u32 mm = kernel_loops_max / kernel_loops_min;
 
@@ -257,16 +315,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       {
         if (kernel_loops > kernel_loops_max_reduced) continue;
 
-        double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops);
+        double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops, kernel_threads, 1);
 
         if (exec_msec < target_msec) break;
       }
     }
 
-    // now the same for kernel-accel but with the new kernel-loops from previous loop set
-
     #define STEPS_CNT 16
 
+    // now the same for kernel-accel but with the new kernel-loops from previous loop set
+
     if (kernel_accel_min < kernel_accel_max)
     {
       for (int i = 0; i < STEPS_CNT; i++)
@@ -276,7 +334,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
         if (kernel_accel_try < kernel_accel_min) continue;
         if (kernel_accel_try > kernel_accel_max) break;
 
-        double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops);
+        double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads, 1);
 
         if (exec_msec > target_msec) break;
 
@@ -292,7 +350,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       const u32 kernel_accel_orig = kernel_accel;
       const u32 kernel_loops_orig = kernel_loops;
 
-      double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+      double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1);
 
       for (int i = 1; i < STEPS_CNT; i++)
       {
@@ -307,7 +365,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         // do a real test
 
-        const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try);
+        const double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try, kernel_threads, 1);
 
         if (exec_msec_prev < exec_msec) break;
 
@@ -324,7 +382,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       }
     }
 
-    double exec_msec_pre_final = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+    double exec_msec_pre_final = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1);
 
     const u32 exec_left = (const u32) (target_msec / exec_msec_pre_final);
 
@@ -339,46 +397,43 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       kernel_accel *= exec_accel_min;
     }
 
-    // start finding best thread count is easier.
-    // it's either the preferred or the maximum thread count
+    // v6.2.4 new section: find thread count
+    // This is not as effective as it could be because of inaccurate kernel return timers,
+    // but it is better than fixed values
+    // Timers in this section are critical, so we rerun measurements 3 times
 
-    /*
-    const u32 kernel_threads_min = device_param->kernel_threads_min;
-    const u32 kernel_threads_max = device_param->kernel_threads_max;
-
-    if (kernel_threads_min < kernel_threads_max)
+    if (kernel_threads_max > kernel_threads_min)
     {
-      const double exec_msec_max = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+      const u32 kernel_accel_orig   = kernel_accel;
+      const u32 kernel_threads_orig = kernel_threads;
 
-      u32 preferred_threads = 0;
+      double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 3);
 
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-      {
-        if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
-        {
-          preferred_threads = device_param->kernel_preferred_wgs_multiple1;
-        }
-        else
-        {
-          preferred_threads = device_param->kernel_preferred_wgs_multiple4;
-        }
-      }
-      else
+      for (int i = 1; i < STEPS_CNT; i++)
       {
-        preferred_threads = device_param->kernel_preferred_wgs_multiple2;
-      }
+        const u32 kernel_accel_try   = kernel_accel_orig   * (1U << i);
+        const u32 kernel_threads_try = kernel_threads_orig / (1U << i);
 
-      if ((preferred_threads >= kernel_threads_min) && (preferred_threads <= kernel_threads_max))
-      {
-        const double exec_msec_preferred = try_run_preferred (hashcat_ctx, device_param, kernel_accel, kernel_loops);
+        // since we do not modify the total amount of work items, we can (and need to) increase kernel_accel_max
 
-        if (exec_msec_preferred < exec_msec_max)
-        {
-          device_param->kernel_threads = preferred_threads;
-        }
+        const u32 kernel_accel_max_try = kernel_accel_max * (1U << i);
+
+        if (kernel_accel_try > kernel_accel_max_try) break;
+
+        if (kernel_threads_try < kernel_threads_min) break;
+
+        if (kernel_threads_try % kernel_wgs_multiple) break; // this would just be waste of time
+
+        double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads_try, 3);
+
+        if (exec_msec > exec_msec_prev) continue;
+
+        exec_msec_prev = exec_msec;
+
+        kernel_accel   = kernel_accel_try;
+        kernel_threads = kernel_threads_try;
       }
     }
-    */
   }
 
   // reset them fake words
@@ -478,8 +533,13 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
   // store
 
-  device_param->kernel_accel = kernel_accel;
-  device_param->kernel_loops = kernel_loops;
+  device_param->kernel_accel   = kernel_accel;
+  device_param->kernel_loops   = kernel_loops;
+  device_param->kernel_threads = kernel_threads;
+
+  const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads;
+
+  device_param->hardware_power = hardware_power;
 
   const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel;
 
diff --git a/src/backend.c b/src/backend.c
index 05cdeeac4..e95793c4f 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -5420,6 +5420,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
 
     if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event2) == -1) return -1;
 
+    if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event1) == -1) return -1;
+
     float exec_ms;
 
     if (hc_cuEventElapsedTime (hashcat_ctx, &exec_ms, device_param->cuda_event1, device_param->cuda_event2) == -1) return -1;
@@ -9095,6 +9097,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
           }
         }
 
+        if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+        {
+          // they like this
+
+          device_param->kernel_preferred_wgs_multiple = 1;
+        }
+
         if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
         {
           if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
@@ -10113,61 +10122,6 @@ static int get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx,
   return 0;
 }
 
-static u32 get_kernel_threads (const hc_device_param_t *device_param)
-{
-  // this is an upper limit, a good start, since our strategy is to reduce thread counts only.
-
-  u32 kernel_threads_min = device_param->kernel_threads_min;
-  u32 kernel_threads_max = device_param->kernel_threads_max;
-
-  // the changes we do here are just optimizations, since the module always has priority.
-
-  const u32 device_maxworkgroup_size = (const u32) device_param->device_maxworkgroup_size;
-
-  kernel_threads_max = MIN (kernel_threads_max, device_maxworkgroup_size);
-
-  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
-  {
-    // for all CPU we just do 1 ...
-
-    kernel_threads_max = MIN (kernel_threads_max, 1);
-  }
-  else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
-  {
-    // for GPU we need to distinguish by vendor
-
-    if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK)
-    {
-      kernel_threads_max = MIN (kernel_threads_max, 8);
-    }
-    else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
-    {
-      if (device_param->kernel_preferred_wgs_multiple == 64)
-      {
-        // only older AMD GPUs with WaveFront size 64 benefit from this
-
-        kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
-      }
-    }
-    else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)
-    {
-      if (device_param->kernel_preferred_wgs_multiple == 64)
-      {
-        // only older AMD GPUs with WaveFront size 64 benefit from this
-
-        kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
-      }
-    }
-  }
-
-  // this is intenionally! at this point, kernel_threads_min can be higher than kernel_threads_max.
-  // in this case we actually want kernel_threads_min selected.
-
-  const u32 kernel_threads = MAX (kernel_threads_min, kernel_threads_max);
-
-  return kernel_threads;
-}
-
 static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const char *kernel_name, char *source_file, char *cached_file, const char *build_options_buf, const bool cache_disable, cl_program *opencl_program, CUmodule *cuda_module, hipModule_t *hip_module)
 {
   const hashconfig_t    *hashconfig    = hashcat_ctx->hashconfig;
@@ -10206,8 +10160,6 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
   if (cached == false)
   {
     #if defined (DEBUG)
-    const user_options_t *user_options = hashcat_ctx->user_options;
-
     if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache. Please be patient...", device_param->device_id + 1, filename_from_filepath (cached_file));
     #endif
 
@@ -10460,7 +10412,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
       //hiprtc_options[1] = "--device-as-default-execution-space";
       //hiprtc_options[2] = "--gpu-architecture";
 
-      hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX));
+      hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
 
       hiprtc_options[1] = "-nocudainc";
       hiprtc_options[2] = "-nocudalib";
@@ -11106,6 +11058,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       }
     }
 
+    // this seems to work always
+
+    if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+    {
+      u32 native_threads = 1;
+
+      if ((native_threads >= device_param->kernel_threads_min) && (native_threads <= device_param->kernel_threads_max))
+      {
+        device_param->kernel_threads_min = native_threads;
+        device_param->kernel_threads_max = native_threads;
+      }
+    }
+
     /**
      * create context for each device
      */
@@ -11423,7 +11388,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       device_param->device_name,
       device_param->opencl_device_version,
       device_param->opencl_driver_version,
-      (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX));
+      (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
 
     md5_ctx_t md5_ctx;
 
@@ -11758,7 +11723,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         device_param->vector_width,
         hashconfig->kern_type,
         extra_value,
-        (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX),
+        (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max,
         build_options_module_buf);
 
       md5_ctx_t md5_ctx;
@@ -14483,7 +14448,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      * now everything that depends on threads and accel, basically dynamic workload
      */
 
-    u32 kernel_threads = get_kernel_threads (device_param);
+    //    u32 kernel_threads = get_kernel_threads (device_param);
 
     if (user_options->attack_mode == ATTACK_MODE_ASSOCIATION)
     {
@@ -14491,12 +14456,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       // in autotune. in this attack mode kernel_power is limited by salts_cnt so we
       // do not have a lot of options left.
 
-      kernel_threads = MIN (kernel_threads, 64);
+      device_param->kernel_threads_min = MIN (device_param->kernel_threads_min, 64);
+      device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64);
     }
 
-    device_param->kernel_threads = kernel_threads;
+    //    device_param->kernel_threads = kernel_threads;
+    device_param->kernel_threads = 0;
 
-    device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * kernel_threads;
+    device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * device_param->kernel_threads_max;
 
     u32 kernel_accel_min = device_param->kernel_accel_min;
     u32 kernel_accel_max = device_param->kernel_accel_max;
@@ -14622,6 +14589,47 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       if ((size_tmps  + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
       if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
 
+      // workaround: for some reason Apple OpenCL can't have buffers larger than 2^31 - 1 (0x7fffffff)
+      // typically runs into trap 6
+      // maybe a 32/64 bit problem affecting size_t?
+
+      if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE)
+      {
+        const size_t undocumented_single_allocation_apple = 0x7fffffff;
+
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_bfs                > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_combs              > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_digests            > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_esalts             > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_hooks              > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_markov_css         > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_plains             > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_pws                > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_pws_amp            > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_pws_comp           > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_pws_idx            > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_results            > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_root_css           > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_rules              > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_rules_c            > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_salts              > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_extra_buffer       > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_shown              > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_tm                 > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_tmps               > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_st_digests         > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_st_salts           > undocumented_single_allocation_apple) memory_limit_hit = 1;
+        if (size_st_esalts          > undocumented_single_allocation_apple) memory_limit_hit = 1;
+      }
+
       const u64 size_total
         = bitmap_ctx->bitmap_size
         + bitmap_ctx->bitmap_size
diff --git a/src/modules/module_01500.c b/src/modules/module_01500.c
index dc7b7b47e..ca0e90232 100644
--- a/src/modules/module_01500.c
+++ b/src/modules/module_01500.c
@@ -95,13 +95,6 @@ int module_build_plain_postprocess (MAYBE_UNUSED const hashconfig_t *hashconfig,
   return src_len;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_threads_max = 64; // performance only optimization
-
-  return kernel_threads_max;
-}
-
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   u32 kernel_loops_max = KERNEL_LOOPS_MAX;
@@ -345,7 +338,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
diff --git a/src/modules/module_03000.c b/src/modules/module_03000.c
index c9b616ab5..cc091a5e5 100644
--- a/src/modules/module_03000.c
+++ b/src/modules/module_03000.c
@@ -96,13 +96,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   return jit_build_options;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_threads_max = 64; // performance only optimization
-
-  return kernel_threads_max;
-}
-
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   u32 kernel_loops_max = KERNEL_LOOPS_MAX;
@@ -269,7 +262,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
diff --git a/src/modules/module_12500.c b/src/modules/module_12500.c
index fda848e0a..a095b8f78 100644
--- a/src/modules/module_12500.c
+++ b/src/modules/module_12500.c
@@ -88,15 +88,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  // -T 128 works slightly faster but it's free for the user to change
-
-  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128;
-
-  return kernel_threads_max;
-}
-
 u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL);
@@ -256,7 +247,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
diff --git a/src/modules/module_14000.c b/src/modules/module_14000.c
index 013888bcc..abe8d259e 100644
--- a/src/modules/module_14000.c
+++ b/src/modules/module_14000.c
@@ -91,13 +91,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
   return jit_build_options;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_threads_max = 64; // performance only optimization
-
-  return kernel_threads_max;
-}
-
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   u32 kernel_loops_max = KERNEL_LOOPS_MAX;
@@ -300,7 +293,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
diff --git a/src/modules/module_14600.c b/src/modules/module_14600.c
index 0ff26ac67..755fef2ff 100644
--- a/src/modules/module_14600.c
+++ b/src/modules/module_14600.c
@@ -179,17 +179,6 @@ typedef struct luks_tmp
 
 } luks_tmp_t;
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  // the module requires a lot of registers for key schedulers on _comp kernel.
-  // it's possible, if using too many threads, there's not enough registers available, typically ending with misleading error message:
-  // cuLaunchKernel(): out of memory
-
-  const u32 kernel_threads_max = 64;
-
-  return kernel_threads_max;
-}
-
 void *module_benchmark_esalt (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   luks_t *luks = (luks_t *) hcmalloc (sizeof (luks_t));
@@ -655,7 +644,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = module_kern_type_dynamic;
diff --git a/src/modules/module_23700.c b/src/modules/module_23700.c
index d3803b0eb..26a24c16f 100644
--- a/src/modules/module_23700.c
+++ b/src/modules/module_23700.c
@@ -105,15 +105,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  // -T 128 works slightly faster but it's free for the user to change
-
-  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128;
-
-  return kernel_threads_max;
-}
-
 u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL);
@@ -377,7 +368,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
diff --git a/src/modules/module_23800.c b/src/modules/module_23800.c
index f910b361f..1ca597082 100644
--- a/src/modules/module_23800.c
+++ b/src/modules/module_23800.c
@@ -388,15 +388,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  // -T 128 works slightly faster but it's free for the user to change
-
-  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128;
-
-  return kernel_threads_max;
-}
-
 u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL);
@@ -654,7 +645,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
diff --git a/src/selftest.c b/src/selftest.c
index 4f8dc1092..2e0b5b000 100644
--- a/src/selftest.c
+++ b/src/selftest.c
@@ -453,6 +453,10 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
   // main : run the kernel
 
+  const u32 kernel_threads_sav = device_param->kernel_threads;
+
+  device_param->kernel_threads = device_param->kernel_threads_min;
+
   const double spin_damp_sav = device_param->spin_damp;
 
   device_param->spin_damp = 0;
@@ -677,6 +681,8 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
   device_param->spin_damp = spin_damp_sav;
 
+  device_param->kernel_threads = kernel_threads_sav;
+
   // check : check if cracked
 
   u32 num_cracked = 0;
diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl
index ba2db8856..a481a87e9 100755
--- a/tools/benchmark_deep.pl
+++ b/tools/benchmark_deep.pl
@@ -409,7 +409,6 @@ for my $hash_type (@hash_types)
     "--wordlist-autohex-disable",
     "--potfile-disable",
     "--logfile-disable",
-    "--hwmon-disable",
     "--status",
     "--status-timer", 1,
     "--runtime", $runtime,