From fd89cac052e4f0dc8c8539b083c33e31dae7852d Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Fri, 9 Feb 2018 23:41:30 +0100
Subject: [PATCH] No longer need to warm-up OpenCL device for benchmark with
 new thread settings

---
 src/opencl.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/opencl.c b/src/opencl.c
index 4819035da..d9cf510c8 100644
--- a/src/opencl.c
+++ b/src/opencl.c
@@ -2460,6 +2460,8 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
         if (CL_rc == -1) return -1;
       }
 
+      /*
+      // this writes the speed cache, which we don't want
       if (user_options->speed_only == true)
       {
         for (int i = 0; i < 16; i++)
@@ -2471,6 +2473,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
         hc_timer_set (&device_param->timer_speed);
       }
+      */
 
       const int rc = choose_kernel (hashcat_ctx, device_param, highest_pw_len, pws_cnt, fast_iteration, salt_pos);
 
@@ -4002,25 +4005,14 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
     }
 
     // We can't have SIMD in kernels where we have an unknown final password length
+    // It also turns out that pure kernels (which have higher register pressure)
+    // actually run faster on scalar GPUs (like the 1080) without SIMD
 
     if ((hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) == 0)
     {
-      if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+      if (device_param->device_type & CL_DEVICE_TYPE_GPU)
       {
-        if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT)
-        {
-          vector_width = 1;
-        }
-        else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
-        {
-          vector_width = 1;
-        }
-      }
-      else
-      {
-        // there's currently no slow kernel that supports NEW_SIMD_CODE which
-        // has unknown final password length in _loop kernel
-        // vector_width = 1;
+        vector_width = 1;
       }
     }
 
@@ -4372,7 +4364,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
     // this value should represent a reasonable amount of memory a host system has per GPU.
     // note we're allocating 3 blocks of that size.
 
-    #define PWS_SPACE (512 * 1024 * 1024)
+    #define PWS_SPACE (64 * 1024 * 1024)
 
     // sometimes device_global_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
     // let's add some extra space just to be sure.