diff --git a/src/interface.c b/src/interface.c
index c73e8628a..e80f24caf 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -26017,6 +26017,16 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
 
   kernel_threads = MIN (kernel_threads, device_param->device_maxworkgroup_size);
 
+  // and (3) if an OpenCL device allows a very high thread count (for example 1024 on nvidia),
+  // the host memory required is 32 times as high as with 32 threads (the device memory requirement jumps from 128MB to 4GB).
+  // since there's no device with that much device memory (because of the 1/4 memory rule), kernel_accel_max has to be
+  // limited to a very low number, because the pws buffer would be too large otherwise.
+  // therefore autotune will be unable to calculate a good kernel_accel multiplier.
+  // currently there's no known OpenCL device that achieves better performance with 1024 threads compared to 256.
+  // as a result, we limit the number of threads to 256, which turns out to be a generally good value.
+
+  kernel_threads = MIN (kernel_threads, 256);
+
   return kernel_threads;
 }