diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index d3a1622a0..8034c4e0e 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -61724,7 +61724,8 @@ __kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max) pw.i[62] = 0; pw.i[63] = 0; // yep that's faster - pw.pw_len = 1 + (l32 & 15); + //pw.pw_len = 1 + (l32 & 15); + pw.pw_len = 7; // some algorithms are very sensitive to this (example: 12500) buf[gid] = pw; } diff --git a/docs/changes.txt b/docs/changes.txt index cac918b76..2388de6d0 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -63,6 +63,7 @@ - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time +- OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count - OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier diff --git a/include/shared.h b/include/shared.h index e5b74b621..0f8e38eef 100644 --- a/include/shared.h +++ b/include/shared.h @@ -61,4 +61,7 @@ bool hc_same_files (char *file1, char *file2); u32 hc_strtoul (const char *nptr, char **endptr, int base); u64 hc_strtoull (const char *nptr, char **endptr, int base); +u32 power_of_two_ceil_32 (const u32 v); +u32 power_of_two_floor_32 (const u32 v); + #endif // _SHARED_H diff --git a/include/types.h b/include/types.h index f51604fe4..32baedd1d 100644 --- a/include/types.h +++ b/include/types.h @@ -138,14 
+138,19 @@ typedef enum amplifier_count KERNEL_BFS = 1024, KERNEL_COMBS = 1024, KERNEL_RULES = 256, - KERNEL_THREADS_MAX_CPU = 1, - KERNEL_THREADS_MAX_GPU = 8, // ex: intel integrated - KERNEL_THREADS_MAX_GPU_NV = 32, // optimized NV size: warps - KERNEL_THREADS_MAX_GPU_AMD = 64, // optimized AMD size: wavefronts - KERNEL_THREADS_MAX_OTHER = 8, // ex: intel MIC } amplifier_count_t; +typedef enum native_threads +{ + KERNEL_THREADS_NATIVE_CPU = 1, + KERNEL_THREADS_NATIVE_GPU = 8, // ex: intel integrated + KERNEL_THREADS_NATIVE_GPU_NV = 32, // optimized NV size: warps + KERNEL_THREADS_NATIVE_GPU_AMD = 64, // optimized AMD size: wavefronts + KERNEL_THREADS_NATIVE_OTHER = 8, // ex: intel MIC + +} native_threads_t; + typedef enum vendor_id { VENDOR_ID_AMD = (1 << 0), @@ -923,6 +928,7 @@ typedef struct hc_device_param u64 device_global_mem; u32 device_maxclock_frequency; size_t device_maxworkgroup_size; + u64 device_local_mem_size; u32 vector_width; diff --git a/src/hashes.c b/src/hashes.c index b50a25e72..b5f1e2cde 100644 --- a/src/hashes.c +++ b/src/hashes.c @@ -351,11 +351,11 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, return -1; } - // we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance - // but sometimes, when a benchmark kernel run cracks a kernel, we don't want to see that! - if (user_options->speed_only == true) { + // we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance + // however if the benchmark cracks the artificial hash used for benchmarks we don't want to see that! 
+ return 0; } diff --git a/src/interface.c b/src/interface.c index 8fba39748..d49ae2c72 100644 --- a/src/interface.c +++ b/src/interface.c @@ -25970,6 +25970,66 @@ u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx) return kernel_threads; } +u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param) +{ + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + + u32 kernel_threads = 0; + + // sometimes there's a high kernel requirement for local memory (which is multiplied with threads) + + u32 local_mem_per_thread = 0; + + // basically the sum of all .local space of the _loop kernel + // see .ptx + + if (hashconfig->hash_mode == 1800) local_mem_per_thread = 1024; + if (hashconfig->hash_mode == 12500) local_mem_per_thread = 3296; + if (hashconfig->hash_mode == 13400) local_mem_per_thread = 5360; + + if (local_mem_per_thread) + { + const u32 device_local_mem_size = (const u32) device_param->device_local_mem_size; + + kernel_threads = device_local_mem_size / local_mem_per_thread; + + // there can be some very unaligned results from this, therefore round it down to next power of two + + kernel_threads = power_of_two_floor_32 (kernel_threads); + } + + // make sure to not underpower + + if (kernel_threads) + { + if (device_param->device_type & CL_DEVICE_TYPE_CPU) + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_CPU); + } + else if (device_param->device_type & CL_DEVICE_TYPE_GPU) + { + if (device_param->device_vendor_id == VENDOR_ID_NV) + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_NV); + } + else if (device_param->device_vendor_id == VENDOR_ID_AMD) + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_AMD); + } + else + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU); + } + } + else + { + kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_OTHER); + } + } + + return kernel_threads; +} + u32 hashconfig_get_kernel_threads 
(hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param) { const user_options_t *user_options = hashcat_ctx->user_options; @@ -25980,13 +26040,22 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p if (forced_kernel_threads) return forced_kernel_threads; - // otherwise it depends on the opencl device type + // it can also depend on the opencl device type - u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size; + u32 kernel_threads = (u32) device_param->device_maxworkgroup_size; if (device_param->device_type & CL_DEVICE_TYPE_CPU) { - kernel_threads = MIN (kernel_threads, KERNEL_THREADS_MAX_CPU); + kernel_threads = MIN (kernel_threads, KERNEL_THREADS_NATIVE_CPU); + } + + // or if it requires for example a lot of local memory + + const u32 limited_kernel_threads = hashconfig_limited_kernel_threads (hashcat_ctx, device_param); + + if (limited_kernel_threads) + { + kernel_threads = MIN (kernel_threads, limited_kernel_threads); } return kernel_threads; @@ -25994,7 +26063,7 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p u32 hashconfig_get_kernel_loops (hashcat_ctx_t *hashcat_ctx) { - hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; const user_options_t *user_options = hashcat_ctx->user_options; u32 kernel_loops_fixed = 0; diff --git a/src/opencl.c b/src/opencl.c index ca5960e7e..f70b31694 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -3306,6 +3306,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->skipped = true; } + device_param->device_local_mem_size = device_local_mem_size; + // If there's both an Intel CPU and an AMD OpenCL runtime it's a tricky situation // Both platforms support CPU device types and therefore both will try to use 100% of the physical resources // This results in both utilizing it for 50% diff --git a/src/shared.c b/src/shared.c index 
7b0e957a9..ece503589 100644 --- a/src/shared.c +++ b/src/shared.c @@ -506,3 +506,32 @@ u64 hc_strtoull (const char *nptr, char **endptr, int base) { return (u64) strtoull (nptr, endptr, base); } + +u32 power_of_two_ceil_32 (const u32 v) +{ + u32 r = v; + + r--; + + r |= r >> 1; + r |= r >> 2; + r |= r >> 4; + r |= r >> 8; + r |= r >> 16; + + r++; + + return r; +} + +u32 power_of_two_floor_32 (const u32 v) +{ + u32 r = power_of_two_ceil_32 (v); + + if (r > v) + { + r >>= 1; + } + + return r; +} diff --git a/src/status.c b/src/status.c index 0a8ff502b..6a6423d4a 100644 --- a/src/status.c +++ b/src/status.c @@ -1389,15 +1389,17 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d if (device_param->skipped == false) { - for (int i = 0; i < SPEED_CACHE; i++) + const u32 speed_pos = device_param->speed_pos; + + for (int i = 0; i < speed_pos; i++) { speed_cnt += device_param->speed_cnt[i]; speed_msec += device_param->speed_msec[i]; } - } - speed_cnt /= SPEED_CACHE; - speed_msec /= SPEED_CACHE; + speed_cnt /= speed_pos; + speed_msec /= speed_pos; + } double hashes_dev_msec = 0;