From 4df00033d71b52295996c22a82f28a154ec59c22 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Sat, 4 May 2019 10:44:03 +0200
Subject: [PATCH] Prepare CUDA events

---
 include/backend.h |  14 +-
 include/types.h   |   1 -
 src/backend.c     | 734 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 559 insertions(+), 190 deletions(-)

diff --git a/include/backend.h b/include/backend.h
index fede1f92b..057edb87f 100644
--- a/include/backend.h
+++ b/include/backend.h
@@ -42,27 +42,33 @@ int hc_nvrtcGetPTX               (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog,
 int hc_cuCtxCreate               (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev);
 int hc_cuCtxDestroy              (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
 int hc_cuCtxSetCurrent           (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
+int hc_cuCtxSynchronize          (hashcat_ctx_t *hashcat_ctx);
 int hc_cuDeviceGetAttribute      (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev);
 int hc_cuDeviceGetCount          (hashcat_ctx_t *hashcat_ctx, int *count);
 int hc_cuDeviceGet               (hashcat_ctx_t *hashcat_ctx, CUdevice *device, int ordinal);
 int hc_cuDeviceGetName           (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev);
 int hc_cuDeviceTotalMem          (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev);
 int hc_cuDriverGetVersion        (hashcat_ctx_t *hashcat_ctx, int *driverVersion);
+int hc_cuEventCreate             (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags);
+int hc_cuEventDestroy            (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
+int hc_cuEventElapsedTime        (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd);
+int hc_cuEventQuery              (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
+int hc_cuEventRecord             (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream);
+int hc_cuEventSynchronize        (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
+int hc_cuFuncGetAttribute        (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
 int hc_cuInit                    (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
+int hc_cuLaunchKernel            (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
 int hc_cuMemAlloc                (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
 int hc_cuMemcpyDtoD              (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
 int hc_cuMemcpyDtoH              (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
 int hc_cuMemcpyHtoD              (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
 int hc_cuMemFree                 (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
+int hc_cuModuleGetFunction       (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
 int hc_cuModuleLoadDataEx        (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
 int hc_cuModuleUnload            (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
-int hc_cuModuleGetFunction       (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
-int hc_cuFuncGetAttribute        (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
 int hc_cuStreamCreate            (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags);
 int hc_cuStreamDestroy           (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
 int hc_cuStreamSynchronize       (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
-int hc_cuLaunchKernel            (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
-int hc_cuCtxSynchronize          (hashcat_ctx_t *hashcat_ctx);
 
 int hc_clBuildProgram            (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data);
 int hc_clCreateBuffer            (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem);
diff --git a/include/types.h b/include/types.h
index 861b27858..5ff50d5d4 100644
--- a/include/types.h
+++ b/include/types.h
@@ -1206,7 +1206,6 @@ typedef struct hc_device_param
 
   double  spin_damp;
 
-
   void   *kernel_params[PARAMCNT];
   void   *kernel_params_mp[PARAMCNT];
   void   *kernel_params_mp_r[PARAMCNT];
diff --git a/src/backend.c b/src/backend.c
index 47c90887c..434abb364 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -1599,6 +1599,168 @@ int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx)
   return 0;
 }
 
+int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventCreate (phEvent, Flags);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventCreate(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventCreate(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventDestroy (hEvent);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventDestroy(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventDestroy(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventQuery (hEvent);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventQuery(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventQuery(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventRecord (hEvent, hStream);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventRecord(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventRecord(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  CUDA_PTR *cuda = backend_ctx->cuda;
+
+  const CUresult CU_err = cuda->cuEventSynchronize (hEvent);
+
+  if (CU_err != CUDA_SUCCESS)
+  {
+    const char *pStr = NULL;
+
+    if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
+    {
+      event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", CU_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
 // OpenCL
 
 int ocl_init (hashcat_ctx_t *hashcat_ctx)
@@ -2623,249 +2785,424 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
 
   device_param->kernel_params_buf64[34] = num;
 
-  u64       kernel_threads = 0;
-  cl_kernel kernel = NULL;
+  u64 kernel_threads = 0;
 
   switch (kern_run)
   {
-    case KERN_RUN_1:
-      kernel          = device_param->opencl_kernel1;
-      kernel_threads  = device_param->kernel_wgs1;
-      break;
-    case KERN_RUN_12:
-      kernel          = device_param->opencl_kernel12;
-      kernel_threads  = device_param->kernel_wgs12;
-      break;
-    case KERN_RUN_2:
-      kernel          = device_param->opencl_kernel2;
-      kernel_threads  = device_param->kernel_wgs2;
-      break;
-    case KERN_RUN_23:
-      kernel          = device_param->opencl_kernel23;
-      kernel_threads  = device_param->kernel_wgs23;
-      break;
-    case KERN_RUN_3:
-      kernel          = device_param->opencl_kernel3;
-      kernel_threads  = device_param->kernel_wgs3;
-      break;
-    case KERN_RUN_4:
-      kernel          = device_param->opencl_kernel4;
-      kernel_threads  = device_param->kernel_wgs4;
-      break;
-    case KERN_RUN_INIT2:
-      kernel          = device_param->opencl_kernel_init2;
-      kernel_threads  = device_param->kernel_wgs_init2;
-      break;
-    case KERN_RUN_LOOP2:
-      kernel          = device_param->opencl_kernel_loop2;
-      kernel_threads  = device_param->kernel_wgs_loop2;
-      break;
-    case KERN_RUN_AUX1:
-      kernel          = device_param->opencl_kernel_aux1;
-      kernel_threads  = device_param->kernel_wgs_aux1;
-      break;
-    case KERN_RUN_AUX2:
-      kernel          = device_param->opencl_kernel_aux2;
-      kernel_threads  = device_param->kernel_wgs_aux2;
-      break;
-    case KERN_RUN_AUX3:
-      kernel          = device_param->opencl_kernel_aux3;
-      kernel_threads  = device_param->kernel_wgs_aux3;
-      break;
-    case KERN_RUN_AUX4:
-      kernel          = device_param->opencl_kernel_aux4;
-      kernel_threads  = device_param->kernel_wgs_aux4;
-      break;
-    default:
-      event_log_error (hashcat_ctx, "Invalid kernel specified.");
-      return -1;
+    case KERN_RUN_1:      kernel_threads  = device_param->kernel_wgs1;      break;
+    case KERN_RUN_12:     kernel_threads  = device_param->kernel_wgs12;     break;
+    case KERN_RUN_2:      kernel_threads  = device_param->kernel_wgs2;      break;
+    case KERN_RUN_23:     kernel_threads  = device_param->kernel_wgs23;     break;
+    case KERN_RUN_3:      kernel_threads  = device_param->kernel_wgs3;      break;
+    case KERN_RUN_4:      kernel_threads  = device_param->kernel_wgs4;      break;
+    case KERN_RUN_INIT2:  kernel_threads  = device_param->kernel_wgs_init2; break;
+    case KERN_RUN_LOOP2:  kernel_threads  = device_param->kernel_wgs_loop2; break;
+    case KERN_RUN_AUX1:   kernel_threads  = device_param->kernel_wgs_aux1;  break;
+    case KERN_RUN_AUX2:   kernel_threads  = device_param->kernel_wgs_aux2;  break;
+    case KERN_RUN_AUX3:   kernel_threads  = device_param->kernel_wgs_aux3;  break;
+    case KERN_RUN_AUX4:   kernel_threads  = device_param->kernel_wgs_aux4;  break;
   }
 
   kernel_threads = MIN (kernel_threads, device_param->kernel_threads);
 
   // kernel_threads = power_of_two_floor_32 (kernel_threads);
 
-  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+  if (device_param->is_cuda == true)
+  {
+    num_elements = CEILDIV (num_elements, kernel_threads);
+  }
+
+  if (device_param->is_opencl == true)
+  {
+    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+  }
 
   int CL_rc;
+  int CU_rc;
 
-  for (u32 i = 0; i <= 23; i++)
+  if (device_param->is_cuda == true)
   {
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_mem), device_param->kernel_params[i]);
+    CUfunction cuda_function = NULL;
 
-    if (CL_rc == -1) return -1;
-  }
-
-  for (u32 i = 24; i <= 33; i++)
-  {
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_uint), device_param->kernel_params[i]);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  for (u32 i = 34; i <= 34; i++)
-  {
-    CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]);
-
-    if (CL_rc == -1) return -1;
-  }
-
-  cl_event event;
-
-  if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
-  {
-    const size_t global_work_size[3] = { num_elements,  32, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
-
-    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &event);
-
-    if (CL_rc == -1) return -1;
-  }
-  else
-  {
-    if (kern_run == KERN_RUN_1)
+    if (device_param->is_cuda == true)
     {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
+      switch (kern_run)
       {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
-    else if (kern_run == KERN_RUN_2)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
-      }
-    }
-    else if (kern_run == KERN_RUN_3)
-    {
-      if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
-      {
-        num_elements = CEILDIV (num_elements, device_param->vector_width);
+        case KERN_RUN_1:      cuda_function = device_param->cuda_function1;      break;
+        case KERN_RUN_12:     cuda_function = device_param->cuda_function12;     break;
+        case KERN_RUN_2:      cuda_function = device_param->cuda_function2;      break;
+        case KERN_RUN_23:     cuda_function = device_param->cuda_function23;     break;
+        case KERN_RUN_3:      cuda_function = device_param->cuda_function3;      break;
+        case KERN_RUN_4:      cuda_function = device_param->cuda_function4;      break;
+        case KERN_RUN_INIT2:  cuda_function = device_param->cuda_function_init2; break;
+        case KERN_RUN_LOOP2:  cuda_function = device_param->cuda_function_loop2; break;
+        case KERN_RUN_AUX1:   cuda_function = device_param->cuda_function_aux1;  break;
+        case KERN_RUN_AUX2:   cuda_function = device_param->cuda_function_aux2;  break;
+        case KERN_RUN_AUX3:   cuda_function = device_param->cuda_function_aux3;  break;
+        case KERN_RUN_AUX4:   cuda_function = device_param->cuda_function_aux4;  break;
       }
     }
 
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+    CUevent cuda_event;
 
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+/*
+    if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
+    {
+      const size_t global_work_size[3] = { num_elements,  32, 1 };
+      const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
 
-    CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &event);
+      CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
+
+      if (CL_rc == -1) return -1;
+    }
+    else
+    {
+      if (kern_run == KERN_RUN_1)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_2)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_3)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+
+      num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+      const size_t global_work_size[3] = { num_elements,   1, 1 };
+      const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+      CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
 
     if (CL_rc == -1) return -1;
-  }
 
-  CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+    // spin damper section
 
-  if (CL_rc == -1) return -1;
+    const u32 iterationm = iteration % EXPECTED_ITERATIONS;
 
-  // spin damper section
+    cl_int opencl_event_status;
 
-  const u32 iterationm = iteration % EXPECTED_ITERATIONS;
+    size_t param_value_size_ret;
 
-  cl_int event_status;
+    CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
 
-  size_t param_value_size_ret;
+    if (CL_rc == -1) return -1;
 
-  CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, &param_value_size_ret);
+    if (device_param->spin_damp > 0)
+    {
+      double spin_total = device_param->spin_damp;
 
-  if (CL_rc == -1) return -1;
+      while (opencl_event_status != CL_COMPLETE)
+      {
+        if (status_ctx->devices_status == STATUS_RUNNING)
+        {
+          switch (kern_run)
+          {
+            case KERN_RUN_1:      if (device_param->exec_us_prev1[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_2:      if (device_param->exec_us_prev2[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_3:      if (device_param->exec_us_prev3[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_4:      if (device_param->exec_us_prev4[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_INIT2:  if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
+            case KERN_RUN_LOOP2:  if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
+            case KERN_RUN_AUX1:   if (device_param->exec_us_prev_aux1[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX2:   if (device_param->exec_us_prev_aux2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX3:   if (device_param->exec_us_prev_aux3[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX4:   if (device_param->exec_us_prev_aux4[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]  * device_param->spin_damp)); break;
+          }
+        }
+        else
+        {
+          // we were told to be nice
 
-  if (device_param->spin_damp > 0)
-  {
-    double spin_total = device_param->spin_damp;
+          sleep (0);
+        }
 
-    while (event_status != CL_COMPLETE)
+        CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
+
+        if (CL_rc == -1) return -1;
+
+        spin_total += device_param->spin_damp;
+
+        if (spin_total > 1) break;
+      }
+    }
+
+    CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event);
+
+    if (CL_rc == -1) return -1;
+
+    cl_ulong time_start;
+    cl_ulong time_end;
+
+    CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
+    CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END,   sizeof (time_end),   &time_end,   NULL); if (CL_rc == -1) return -1;
+
+    const double exec_us = (double) (time_end - time_start) / 1000;
+
+    if (device_param->spin_damp > 0)
     {
       if (status_ctx->devices_status == STATUS_RUNNING)
       {
         switch (kern_run)
         {
-          case KERN_RUN_1:      if (device_param->exec_us_prev1[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_2:      if (device_param->exec_us_prev2[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_3:      if (device_param->exec_us_prev3[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_4:      if (device_param->exec_us_prev4[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]      * device_param->spin_damp)); break;
-          case KERN_RUN_INIT2:  if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
-          case KERN_RUN_LOOP2:  if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
-          case KERN_RUN_AUX1:   if (device_param->exec_us_prev_aux1[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]  * device_param->spin_damp)); break;
-          case KERN_RUN_AUX2:   if (device_param->exec_us_prev_aux2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]  * device_param->spin_damp)); break;
-          case KERN_RUN_AUX3:   if (device_param->exec_us_prev_aux3[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]  * device_param->spin_damp)); break;
-          case KERN_RUN_AUX4:   if (device_param->exec_us_prev_aux4[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]  * device_param->spin_damp)); break;
+          case KERN_RUN_1:      device_param->exec_us_prev1[iterationm]      = exec_us; break;
+          case KERN_RUN_2:      device_param->exec_us_prev2[iterationm]      = exec_us; break;
+          case KERN_RUN_3:      device_param->exec_us_prev3[iterationm]      = exec_us; break;
+          case KERN_RUN_4:      device_param->exec_us_prev4[iterationm]      = exec_us; break;
+          case KERN_RUN_INIT2:  device_param->exec_us_prev_init2[iterationm] = exec_us; break;
+          case KERN_RUN_LOOP2:  device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
+          case KERN_RUN_AUX1:   device_param->exec_us_prev_aux1[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX2:   device_param->exec_us_prev_aux2[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX3:   device_param->exec_us_prev_aux3[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX4:   device_param->exec_us_prev_aux4[iterationm]  = exec_us; break;
         }
       }
-      else
-      {
-        // we were told to be nice
+    }
 
-        sleep (0);
+    if (event_update)
+    {
+      u32 exec_pos = device_param->exec_pos;
+
+      device_param->exec_msec[exec_pos] = exec_us / 1000;
+
+      exec_pos++;
+
+      if (exec_pos == EXEC_CACHE)
+      {
+        exec_pos = 0;
       }
 
-      CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, &param_value_size_ret);
-
-      if (CL_rc == -1) return -1;
-
-      spin_total += device_param->spin_damp;
-
-      if (spin_total > 1) break;
+      device_param->exec_pos = exec_pos;
     }
+
+    CL_rc = hc_clReleaseEvent (hashcat_ctx, opencl_event);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+*/
   }
 
-  CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &event);
-
-  if (CL_rc == -1) return -1;
-
-  cl_ulong time_start;
-  cl_ulong time_end;
-
-  CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
-  CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_END,   sizeof (time_end),   &time_end,   NULL); if (CL_rc == -1) return -1;
-
-  const double exec_us = (double) (time_end - time_start) / 1000;
-
-  if (device_param->spin_damp > 0)
+  if (device_param->is_opencl == true)
   {
-    if (status_ctx->devices_status == STATUS_RUNNING)
+    cl_kernel opencl_kernel = NULL;
+
+    if (device_param->is_opencl == true)
     {
       switch (kern_run)
       {
-        case KERN_RUN_1:      device_param->exec_us_prev1[iterationm]      = exec_us; break;
-        case KERN_RUN_2:      device_param->exec_us_prev2[iterationm]      = exec_us; break;
-        case KERN_RUN_3:      device_param->exec_us_prev3[iterationm]      = exec_us; break;
-        case KERN_RUN_4:      device_param->exec_us_prev4[iterationm]      = exec_us; break;
-        case KERN_RUN_INIT2:  device_param->exec_us_prev_init2[iterationm] = exec_us; break;
-        case KERN_RUN_LOOP2:  device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
-        case KERN_RUN_AUX1:   device_param->exec_us_prev_aux1[iterationm]  = exec_us; break;
-        case KERN_RUN_AUX2:   device_param->exec_us_prev_aux2[iterationm]  = exec_us; break;
-        case KERN_RUN_AUX3:   device_param->exec_us_prev_aux3[iterationm]  = exec_us; break;
-        case KERN_RUN_AUX4:   device_param->exec_us_prev_aux4[iterationm]  = exec_us; break;
+        case KERN_RUN_1:      opencl_kernel = device_param->opencl_kernel1;      break;
+        case KERN_RUN_12:     opencl_kernel = device_param->opencl_kernel12;     break;
+        case KERN_RUN_2:      opencl_kernel = device_param->opencl_kernel2;      break;
+        case KERN_RUN_23:     opencl_kernel = device_param->opencl_kernel23;     break;
+        case KERN_RUN_3:      opencl_kernel = device_param->opencl_kernel3;      break;
+        case KERN_RUN_4:      opencl_kernel = device_param->opencl_kernel4;      break;
+        case KERN_RUN_INIT2:  opencl_kernel = device_param->opencl_kernel_init2; break;
+        case KERN_RUN_LOOP2:  opencl_kernel = device_param->opencl_kernel_loop2; break;
+        case KERN_RUN_AUX1:   opencl_kernel = device_param->opencl_kernel_aux1;  break;
+        case KERN_RUN_AUX2:   opencl_kernel = device_param->opencl_kernel_aux2;  break;
+        case KERN_RUN_AUX3:   opencl_kernel = device_param->opencl_kernel_aux3;  break;
+        case KERN_RUN_AUX4:   opencl_kernel = device_param->opencl_kernel_aux4;  break;
       }
     }
-  }
 
-  if (event_update)
-  {
-    u32 exec_pos = device_param->exec_pos;
-
-    device_param->exec_msec[exec_pos] = exec_us / 1000;
-
-    exec_pos++;
-
-    if (exec_pos == EXEC_CACHE)
+    for (u32 i = 0; i <= 23; i++)
     {
-      exec_pos = 0;
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]);
+
+      if (CL_rc == -1) return -1;
     }
 
-    device_param->exec_pos = exec_pos;
+    for (u32 i = 24; i <= 33; i++)
+    {
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    for (u32 i = 34; i <= 34; i++)
+    {
+      CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    cl_event opencl_event;
+
+    if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
+    {
+      const size_t global_work_size[3] = { num_elements,  32, 1 };
+      const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+      CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
+
+      if (CL_rc == -1) return -1;
+    }
+    else
+    {
+      if (kern_run == KERN_RUN_1)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_2)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+      else if (kern_run == KERN_RUN_3)
+      {
+        if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
+        {
+          num_elements = CEILDIV (num_elements, device_param->vector_width);
+        }
+      }
+
+      num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+      const size_t global_work_size[3] = { num_elements,   1, 1 };
+      const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+      CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
+
+      if (CL_rc == -1) return -1;
+    }
+
+    CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
+
+    // spin damper section
+
+    const u32 iterationm = iteration % EXPECTED_ITERATIONS;
+
+    cl_int opencl_event_status;
+
+    size_t param_value_size_ret;
+
+    CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
+
+    if (CL_rc == -1) return -1;
+
+    if (device_param->spin_damp > 0)
+    {
+      double spin_total = device_param->spin_damp;
+
+      while (opencl_event_status != CL_COMPLETE)
+      {
+        if (status_ctx->devices_status == STATUS_RUNNING)
+        {
+          switch (kern_run)
+          {
+            case KERN_RUN_1:      if (device_param->exec_us_prev1[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_2:      if (device_param->exec_us_prev2[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_3:      if (device_param->exec_us_prev3[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_4:      if (device_param->exec_us_prev4[iterationm]      > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm]      * device_param->spin_damp)); break;
+            case KERN_RUN_INIT2:  if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
+            case KERN_RUN_LOOP2:  if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
+            case KERN_RUN_AUX1:   if (device_param->exec_us_prev_aux1[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX2:   if (device_param->exec_us_prev_aux2[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX3:   if (device_param->exec_us_prev_aux3[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm]  * device_param->spin_damp)); break;
+            case KERN_RUN_AUX4:   if (device_param->exec_us_prev_aux4[iterationm]  > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm]  * device_param->spin_damp)); break;
+          }
+        }
+        else
+        {
+          // we were told to be nice
+
+          sleep (0);
+        }
+
+        CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
+
+        if (CL_rc == -1) return -1;
+
+        spin_total += device_param->spin_damp;
+
+        if (spin_total > 1) break;
+      }
+    }
+
+    CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event);
+
+    if (CL_rc == -1) return -1;
+
+    cl_ulong time_start;
+    cl_ulong time_end;
+
+    CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
+    CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END,   sizeof (time_end),   &time_end,   NULL); if (CL_rc == -1) return -1;
+
+    const double exec_us = (double) (time_end - time_start) / 1000;
+
+    if (device_param->spin_damp > 0)
+    {
+      if (status_ctx->devices_status == STATUS_RUNNING)
+      {
+        switch (kern_run)
+        {
+          case KERN_RUN_1:      device_param->exec_us_prev1[iterationm]      = exec_us; break;
+          case KERN_RUN_2:      device_param->exec_us_prev2[iterationm]      = exec_us; break;
+          case KERN_RUN_3:      device_param->exec_us_prev3[iterationm]      = exec_us; break;
+          case KERN_RUN_4:      device_param->exec_us_prev4[iterationm]      = exec_us; break;
+          case KERN_RUN_INIT2:  device_param->exec_us_prev_init2[iterationm] = exec_us; break;
+          case KERN_RUN_LOOP2:  device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
+          case KERN_RUN_AUX1:   device_param->exec_us_prev_aux1[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX2:   device_param->exec_us_prev_aux2[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX3:   device_param->exec_us_prev_aux3[iterationm]  = exec_us; break;
+          case KERN_RUN_AUX4:   device_param->exec_us_prev_aux4[iterationm]  = exec_us; break;
+        }
+      }
+    }
+
+    if (event_update)
+    {
+      u32 exec_pos = device_param->exec_pos;
+
+      device_param->exec_msec[exec_pos] = exec_us / 1000;
+
+      exec_pos++;
+
+      if (exec_pos == EXEC_CACHE)
+      {
+        exec_pos = 0;
+      }
+
+      device_param->exec_pos = exec_pos;
+    }
+
+    CL_rc = hc_clReleaseEvent (hashcat_ctx, opencl_event);
+
+    if (CL_rc == -1) return -1;
+
+    CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
+
+    if (CL_rc == -1) return -1;
   }
 
-  CL_rc = hc_clReleaseEvent (hashcat_ctx, event);
-
-  if (CL_rc == -1) return -1;
-
-  CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
-
-  if (CL_rc == -1) return -1;
-
   return 0;
 }
 
@@ -9705,6 +10042,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     if (device_param->is_cuda == true)
     {
+      device_param->kernel_params[ 0] = &device_param->cuda_d_pws_buf;
+      device_param->kernel_params[ 4] = &device_param->cuda_d_tmps;
+      device_param->kernel_params[ 5] = &device_param->cuda_d_hooks;
     }
 
     if (device_param->is_opencl == true)
@@ -9729,6 +10069,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         {
           if (device_param->is_cuda == true)
           {
+            device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                              ? &device_param->cuda_d_pws_buf
+                                              : &device_param->cuda_d_pws_amp_buf;
+
+            //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1;
           }
 
           if (device_param->is_opencl == true)
@@ -9746,6 +10091,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       {
         if (device_param->is_cuda == true)
         {
+          device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                              ? &device_param->cuda_d_pws_buf
+                                              : &device_param->cuda_d_pws_amp_buf;
+
+          //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1;
         }
 
         if (device_param->is_opencl == true)
@@ -9766,6 +10116,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       {
         if (device_param->is_cuda == true)
         {
+          device_param->kernel_params_amp[0] = &device_param->cuda_d_pws_buf;
+          device_param->kernel_params_amp[1] = &device_param->cuda_d_pws_amp_buf;
+
+          //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 0, sizeof (cl_mem), device_param->kernel_params_amp[0]); if (CL_rc == -1) return -1;
+          //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1;
         }
 
         if (device_param->is_opencl == true)
@@ -9781,6 +10136,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     if (device_param->is_cuda == true)
     {
+      device_param->kernel_params_decompress[0] = &device_param->cuda_d_pws_idx;
+      device_param->kernel_params_decompress[1] = &device_param->cuda_d_pws_comp_buf;
+      device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+                                                ? &device_param->cuda_d_pws_buf
+                                                : &device_param->cuda_d_pws_amp_buf;
+
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
+      //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
     }
 
     if (device_param->is_opencl == true)