1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-03-04 09:06:05 +00:00

Prepare CUDA events

This commit is contained in:
Jens Steube 2019-05-04 10:44:03 +02:00
parent f2948460c9
commit 4df00033d7
3 changed files with 559 additions and 190 deletions

View File

@@ -42,27 +42,33 @@ int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog,
int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev);
int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx);
int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx);
int hc_cuDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev);
int hc_cuDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count);
int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice *device, int ordinal);
int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev);
int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev);
int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion);
int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags);
int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd);
int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream);
int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent);
int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc);
int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags);
int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream);
int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx);
int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data);
int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem);

View File

@@ -1206,7 +1206,6 @@ typedef struct hc_device_param
double spin_damp;
void *kernel_params[PARAMCNT];
void *kernel_params_mp[PARAMCNT];
void *kernel_params_mp_r[PARAMCNT];

View File

@@ -1599,6 +1599,168 @@ int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx)
return 0;
}
// Checked wrapper around the CUDA driver API's cuEventCreate: forwards the
// call through the dynamically resolved function-pointer table and logs any
// failure via the hashcat event system. Returns 0 on success, -1 on error.
int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags)
{
  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;

  CUDA_PTR *cuda = backend_ctx->cuda;

  const CUresult rc = cuda->cuEventCreate (phEvent, Flags);

  if (rc == CUDA_SUCCESS) return 0;

  // Prefer the driver's human-readable message; fall back to the raw code.
  const char *err_str = NULL;

  if (cuda->cuGetErrorString (rc, &err_str) == CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuEventCreate(): %s", err_str);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuEventCreate(): %d", rc);
  }

  return -1;
}
// Checked wrapper around the CUDA driver API's cuEventDestroy: forwards the
// call through the dynamically resolved function-pointer table and logs any
// failure via the hashcat event system. Returns 0 on success, -1 on error.
int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
{
  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;

  CUDA_PTR *cuda = backend_ctx->cuda;

  const CUresult rc = cuda->cuEventDestroy (hEvent);

  if (rc == CUDA_SUCCESS) return 0;

  // Prefer the driver's human-readable message; fall back to the raw code.
  const char *err_str = NULL;

  if (cuda->cuGetErrorString (rc, &err_str) == CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuEventDestroy(): %s", err_str);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuEventDestroy(): %d", rc);
  }

  return -1;
}
// Checked wrapper around the CUDA driver API's cuEventElapsedTime: reports
// the time between two recorded events (in milliseconds, via *pMilliseconds)
// and logs any failure via the hashcat event system.
// Returns 0 on success, -1 on error.
int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd)
{
  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;

  CUDA_PTR *cuda = backend_ctx->cuda;

  const CUresult rc = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd);

  if (rc == CUDA_SUCCESS) return 0;

  // Prefer the driver's human-readable message; fall back to the raw code.
  const char *err_str = NULL;

  if (cuda->cuGetErrorString (rc, &err_str) == CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", err_str);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", rc);
  }

  return -1;
}
// Checked wrapper around the CUDA driver API's cuEventQuery: polls the
// completion status of an event and logs any non-success result via the
// hashcat event system. Returns 0 on success, -1 on error.
// NOTE(review): cuEventQuery returns CUDA_ERROR_NOT_READY while work is
// still pending, which this wrapper treats as an error — callers presumably
// rely on that; confirm before reusing for non-blocking polling.
int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
{
  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;

  CUDA_PTR *cuda = backend_ctx->cuda;

  const CUresult rc = cuda->cuEventQuery (hEvent);

  if (rc == CUDA_SUCCESS) return 0;

  // Prefer the driver's human-readable message; fall back to the raw code.
  const char *err_str = NULL;

  if (cuda->cuGetErrorString (rc, &err_str) == CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuEventQuery(): %s", err_str);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuEventQuery(): %d", rc);
  }

  return -1;
}
// Checked wrapper around the CUDA driver API's cuEventRecord: records the
// event into the given stream and logs any failure via the hashcat event
// system. Returns 0 on success, -1 on error.
int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream)
{
  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;

  CUDA_PTR *cuda = backend_ctx->cuda;

  const CUresult rc = cuda->cuEventRecord (hEvent, hStream);

  if (rc == CUDA_SUCCESS) return 0;

  // Prefer the driver's human-readable message; fall back to the raw code.
  const char *err_str = NULL;

  if (cuda->cuGetErrorString (rc, &err_str) == CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuEventRecord(): %s", err_str);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuEventRecord(): %d", rc);
  }

  return -1;
}
// Checked wrapper around the CUDA driver API's cuEventSynchronize: blocks
// until the event has completed and logs any failure via the hashcat event
// system. Returns 0 on success, -1 on error.
int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent)
{
  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;

  CUDA_PTR *cuda = backend_ctx->cuda;

  const CUresult rc = cuda->cuEventSynchronize (hEvent);

  if (rc == CUDA_SUCCESS) return 0;

  // Prefer the driver's human-readable message; fall back to the raw code.
  const char *err_str = NULL;

  if (cuda->cuGetErrorString (rc, &err_str) == CUDA_SUCCESS)
  {
    event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", err_str);
  }
  else
  {
    event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", rc);
  }

  return -1;
}
// OpenCL
int ocl_init (hashcat_ctx_t *hashcat_ctx)
@@ -2624,100 +2786,72 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
device_param->kernel_params_buf64[34] = num;
u64 kernel_threads = 0;
cl_kernel kernel = NULL;
switch (kern_run)
{
case KERN_RUN_1:
kernel = device_param->opencl_kernel1;
kernel_threads = device_param->kernel_wgs1;
break;
case KERN_RUN_12:
kernel = device_param->opencl_kernel12;
kernel_threads = device_param->kernel_wgs12;
break;
case KERN_RUN_2:
kernel = device_param->opencl_kernel2;
kernel_threads = device_param->kernel_wgs2;
break;
case KERN_RUN_23:
kernel = device_param->opencl_kernel23;
kernel_threads = device_param->kernel_wgs23;
break;
case KERN_RUN_3:
kernel = device_param->opencl_kernel3;
kernel_threads = device_param->kernel_wgs3;
break;
case KERN_RUN_4:
kernel = device_param->opencl_kernel4;
kernel_threads = device_param->kernel_wgs4;
break;
case KERN_RUN_INIT2:
kernel = device_param->opencl_kernel_init2;
kernel_threads = device_param->kernel_wgs_init2;
break;
case KERN_RUN_LOOP2:
kernel = device_param->opencl_kernel_loop2;
kernel_threads = device_param->kernel_wgs_loop2;
break;
case KERN_RUN_AUX1:
kernel = device_param->opencl_kernel_aux1;
kernel_threads = device_param->kernel_wgs_aux1;
break;
case KERN_RUN_AUX2:
kernel = device_param->opencl_kernel_aux2;
kernel_threads = device_param->kernel_wgs_aux2;
break;
case KERN_RUN_AUX3:
kernel = device_param->opencl_kernel_aux3;
kernel_threads = device_param->kernel_wgs_aux3;
break;
case KERN_RUN_AUX4:
kernel = device_param->opencl_kernel_aux4;
kernel_threads = device_param->kernel_wgs_aux4;
break;
default:
event_log_error (hashcat_ctx, "Invalid kernel specified.");
return -1;
case KERN_RUN_1: kernel_threads = device_param->kernel_wgs1; break;
case KERN_RUN_12: kernel_threads = device_param->kernel_wgs12; break;
case KERN_RUN_2: kernel_threads = device_param->kernel_wgs2; break;
case KERN_RUN_23: kernel_threads = device_param->kernel_wgs23; break;
case KERN_RUN_3: kernel_threads = device_param->kernel_wgs3; break;
case KERN_RUN_4: kernel_threads = device_param->kernel_wgs4; break;
case KERN_RUN_INIT2: kernel_threads = device_param->kernel_wgs_init2; break;
case KERN_RUN_LOOP2: kernel_threads = device_param->kernel_wgs_loop2; break;
case KERN_RUN_AUX1: kernel_threads = device_param->kernel_wgs_aux1; break;
case KERN_RUN_AUX2: kernel_threads = device_param->kernel_wgs_aux2; break;
case KERN_RUN_AUX3: kernel_threads = device_param->kernel_wgs_aux3; break;
case KERN_RUN_AUX4: kernel_threads = device_param->kernel_wgs_aux4; break;
}
kernel_threads = MIN (kernel_threads, device_param->kernel_threads);
// kernel_threads = power_of_two_floor_32 (kernel_threads);
if (device_param->is_cuda == true)
{
num_elements = CEILDIV (num_elements, kernel_threads);
}
if (device_param->is_opencl == true)
{
num_elements = round_up_multiple_64 (num_elements, kernel_threads);
}
int CL_rc;
int CU_rc;
for (u32 i = 0; i <= 23; i++)
if (device_param->is_cuda == true)
{
CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_mem), device_param->kernel_params[i]);
CUfunction cuda_function = NULL;
if (CL_rc == -1) return -1;
if (device_param->is_cuda == true)
{
switch (kern_run)
{
case KERN_RUN_1: cuda_function = device_param->cuda_function1; break;
case KERN_RUN_12: cuda_function = device_param->cuda_function12; break;
case KERN_RUN_2: cuda_function = device_param->cuda_function2; break;
case KERN_RUN_23: cuda_function = device_param->cuda_function23; break;
case KERN_RUN_3: cuda_function = device_param->cuda_function3; break;
case KERN_RUN_4: cuda_function = device_param->cuda_function4; break;
case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break;
case KERN_RUN_LOOP2: cuda_function = device_param->cuda_function_loop2; break;
case KERN_RUN_AUX1: cuda_function = device_param->cuda_function_aux1; break;
case KERN_RUN_AUX2: cuda_function = device_param->cuda_function_aux2; break;
case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break;
case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break;
}
}
for (u32 i = 24; i <= 33; i++)
{
CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_uint), device_param->kernel_params[i]);
if (CL_rc == -1) return -1;
}
for (u32 i = 34; i <= 34; i++)
{
CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]);
if (CL_rc == -1) return -1;
}
cl_event event;
CUevent cuda_event;
/*
if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
{
const size_t global_work_size[3] = { num_elements, 32, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &event);
CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
if (CL_rc == -1) return -1;
}
@@ -2750,7 +2884,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &event);
CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
if (CL_rc == -1) return -1;
}
@@ -2763,11 +2897,11 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
const u32 iterationm = iteration % EXPECTED_ITERATIONS;
cl_int event_status;
cl_int opencl_event_status;
size_t param_value_size_ret;
CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, &param_value_size_ret);
CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
if (CL_rc == -1) return -1;
@@ -2775,7 +2909,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
{
double spin_total = device_param->spin_damp;
while (event_status != CL_COMPLETE)
while (opencl_event_status != CL_COMPLETE)
{
if (status_ctx->devices_status == STATUS_RUNNING)
{
@@ -2800,7 +2934,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
sleep (0);
}
CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, &param_value_size_ret);
CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
if (CL_rc == -1) return -1;
@@ -2810,15 +2944,15 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
}
}
CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &event);
CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event);
if (CL_rc == -1) return -1;
cl_ulong time_start;
cl_ulong time_end;
CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_END, sizeof (time_end), &time_end, NULL); if (CL_rc == -1) return -1;
CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END, sizeof (time_end), &time_end, NULL); if (CL_rc == -1) return -1;
const double exec_us = (double) (time_end - time_start) / 1000;
@@ -2858,13 +2992,216 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
device_param->exec_pos = exec_pos;
}
CL_rc = hc_clReleaseEvent (hashcat_ctx, event);
CL_rc = hc_clReleaseEvent (hashcat_ctx, opencl_event);
if (CL_rc == -1) return -1;
CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
if (CL_rc == -1) return -1;
*/
}
if (device_param->is_opencl == true)
{
cl_kernel opencl_kernel = NULL;
if (device_param->is_opencl == true)
{
switch (kern_run)
{
case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break;
case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break;
case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break;
case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break;
case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break;
case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break;
case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break;
case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break;
case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break;
case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break;
case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break;
case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break;
}
}
for (u32 i = 0; i <= 23; i++)
{
CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]);
if (CL_rc == -1) return -1;
}
for (u32 i = 24; i <= 33; i++)
{
CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]);
if (CL_rc == -1) return -1;
}
for (u32 i = 34; i <= 34; i++)
{
CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]);
if (CL_rc == -1) return -1;
}
cl_event opencl_event;
if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF))
{
const size_t global_work_size[3] = { num_elements, 32, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
if (CL_rc == -1) return -1;
}
else
{
if (kern_run == KERN_RUN_1)
{
if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT)
{
num_elements = CEILDIV (num_elements, device_param->vector_width);
}
}
else if (kern_run == KERN_RUN_2)
{
if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP)
{
num_elements = CEILDIV (num_elements, device_param->vector_width);
}
}
else if (kern_run == KERN_RUN_3)
{
if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP)
{
num_elements = CEILDIV (num_elements, device_param->vector_width);
}
}
num_elements = round_up_multiple_64 (num_elements, kernel_threads);
const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event);
if (CL_rc == -1) return -1;
}
CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
if (CL_rc == -1) return -1;
// spin damper section
const u32 iterationm = iteration % EXPECTED_ITERATIONS;
cl_int opencl_event_status;
size_t param_value_size_ret;
CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
if (CL_rc == -1) return -1;
if (device_param->spin_damp > 0)
{
double spin_total = device_param->spin_damp;
while (opencl_event_status != CL_COMPLETE)
{
if (status_ctx->devices_status == STATUS_RUNNING)
{
switch (kern_run)
{
case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break;
}
}
else
{
// we were told to be nice
sleep (0);
}
CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, &param_value_size_ret);
if (CL_rc == -1) return -1;
spin_total += device_param->spin_damp;
if (spin_total > 1) break;
}
}
CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event);
if (CL_rc == -1) return -1;
cl_ulong time_start;
cl_ulong time_end;
CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1;
CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END, sizeof (time_end), &time_end, NULL); if (CL_rc == -1) return -1;
const double exec_us = (double) (time_end - time_start) / 1000;
if (device_param->spin_damp > 0)
{
if (status_ctx->devices_status == STATUS_RUNNING)
{
switch (kern_run)
{
case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break;
case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break;
case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break;
case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break;
case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break;
case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break;
case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break;
case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break;
case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break;
}
}
}
if (event_update)
{
u32 exec_pos = device_param->exec_pos;
device_param->exec_msec[exec_pos] = exec_us / 1000;
exec_pos++;
if (exec_pos == EXEC_CACHE)
{
exec_pos = 0;
}
device_param->exec_pos = exec_pos;
}
CL_rc = hc_clReleaseEvent (hashcat_ctx, opencl_event);
if (CL_rc == -1) return -1;
CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue);
if (CL_rc == -1) return -1;
}
return 0;
}
@@ -9705,6 +10042,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (device_param->is_cuda == true)
{
device_param->kernel_params[ 0] = &device_param->cuda_d_pws_buf;
device_param->kernel_params[ 4] = &device_param->cuda_d_tmps;
device_param->kernel_params[ 5] = &device_param->cuda_d_hooks;
}
if (device_param->is_opencl == true)
@@ -9729,6 +10069,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
{
if (device_param->is_cuda == true)
{
device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
? &device_param->cuda_d_pws_buf
: &device_param->cuda_d_pws_amp_buf;
//CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1;
}
if (device_param->is_opencl == true)
@@ -9746,6 +10091,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
{
if (device_param->is_cuda == true)
{
device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
? &device_param->cuda_d_pws_buf
: &device_param->cuda_d_pws_amp_buf;
//CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1;
}
if (device_param->is_opencl == true)
@@ -9766,6 +10116,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
{
if (device_param->is_cuda == true)
{
device_param->kernel_params_amp[0] = &device_param->cuda_d_pws_buf;
device_param->kernel_params_amp[1] = &device_param->cuda_d_pws_amp_buf;
//CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 0, sizeof (cl_mem), device_param->kernel_params_amp[0]); if (CL_rc == -1) return -1;
//CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1;
}
if (device_param->is_opencl == true)
@@ -9781,6 +10136,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (device_param->is_cuda == true)
{
device_param->kernel_params_decompress[0] = &device_param->cuda_d_pws_idx;
device_param->kernel_params_decompress[1] = &device_param->cuda_d_pws_comp_buf;
device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
? &device_param->cuda_d_pws_buf
: &device_param->cuda_d_pws_amp_buf;
//CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1;
//CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1;
//CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1;
}
if (device_param->is_opencl == true)