Kernels: Add standalone true UTF8 to UTF16 converter kernel that runs after amplifier. Use OPTS_TYPE_POST_AMP_UTF16LE from plugin

2025-04-08 02:35:46 +00:00 · 2021-05-20 14:34:24 +02:00 · 2021-05-20 14:34:24 +02:00 · 5c6501444a
commit 5c6501444a
parent 1afbcb0827
6 changed files with 146 additions and 0 deletions
--- a/OpenCL/shared.cl
+++ b/OpenCL/shared.cl
@ -212,3 +212,24 @@ KERNEL_FQ void gpu_atinit (GLOBAL_AS pw_t *buf, const u64 gid_max)

  buf[gid] = pw;
 }
+
+KERNEL_FQ void gpu_utf8_to_utf16 (GLOBAL_AS pw_t *pws_buf, const u64 gid_max)
+{
+  const u64 gid = get_global_id (0);
+
+  if (gid >= gid_max) return;
+
+  pw_t pw_in = pws_buf[gid];
+
+  pw_t pw_out;
+
+  for (int i = 0; i < 64; i++) pw_out.i[i] = 0;
+
+  hc_enc_t hc_enc;
+
+  hc_enc_init (&hc_enc);
+
+  pw_out.pw_len = hc_enc_next (&hc_enc, pw_in.i, pw_in.pw_len, 64, pw_out.i, 256);
+
+  pws_buf[gid] = pw_out;
+}
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -23,6 +23,7 @@

 - Dependencies: Updated LZMA SDK from 19.00 to 21.02 alpha
 - Dependencies: Updated xxHash from 0.1.0 to v0.8.0 - Stable XXH3
+- Kernels: Add standalone true UTF8 to UTF16 converter kernel that runs after amplifier. Use OPTS_TYPE_POST_AMP_UTF16LE from plugin

 * changes v6.2.0 -> v6.2.1

--- a/include/backend.h
+++ b/include/backend.h
@ -122,10 +122,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 pws_cnt, const u8 chr);

 int run_cuda_kernel_atinit    (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num);
+int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num);
 int run_cuda_kernel_memset    (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size);
 int run_cuda_kernel_bzero     (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size);

 int run_opencl_kernel_atinit  (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num);
+int run_opencl_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num);
 int run_opencl_kernel_memset  (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size);
 int run_opencl_kernel_bzero   (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size);

--- a/include/types.h
+++ b/include/types.h
@ -442,6 +442,7 @@ typedef enum opts_type
  OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 51), // some algos use JiT in combinations with a salt or create too much startup time
  OPTS_TYPE_MP_MULTI_DISABLE  = (1ULL << 52), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
  OPTS_TYPE_NATIVE_THREADS    = (1ULL << 53), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
+  OPTS_TYPE_POST_AMP_UTF16LE  = (1ULL << 54), // run the utf8 to utf16le conversion kernel after they have been processed from amplifiers

 } opts_type_t;

@ -1116,6 +1117,7 @@ typedef struct hc_device_param
  u32     kernel_wgs_tm;
  u32     kernel_wgs_memset;
  u32     kernel_wgs_atinit;
+  u32     kernel_wgs_utf8toutf16le;
  u32     kernel_wgs_decompress;
  u32     kernel_wgs_aux1;
  u32     kernel_wgs_aux2;
@ -1140,6 +1142,7 @@ typedef struct hc_device_param
  u32     kernel_preferred_wgs_multiple_tm;
  u32     kernel_preferred_wgs_multiple_memset;
  u32     kernel_preferred_wgs_multiple_atinit;
+  u32     kernel_preferred_wgs_multiple_utf8toutf16le;
  u32     kernel_preferred_wgs_multiple_decompress;
  u32     kernel_preferred_wgs_multiple_aux1;
  u32     kernel_preferred_wgs_multiple_aux2;
@ -1164,6 +1167,7 @@ typedef struct hc_device_param
  u64     kernel_local_mem_size_tm;
  u64     kernel_local_mem_size_memset;
  u64     kernel_local_mem_size_atinit;
+  u64     kernel_local_mem_size_utf8toutf16le;
  u64     kernel_local_mem_size_decompress;
  u64     kernel_local_mem_size_aux1;
  u64     kernel_local_mem_size_aux2;
@ -1188,6 +1192,7 @@ typedef struct hc_device_param
  u64     kernel_dynamic_local_mem_size_tm;
  u64     kernel_dynamic_local_mem_size_memset;
  u64     kernel_dynamic_local_mem_size_atinit;
+  u64     kernel_dynamic_local_mem_size_utf8toutf16le;
  u64     kernel_dynamic_local_mem_size_decompress;
  u64     kernel_dynamic_local_mem_size_aux1;
  u64     kernel_dynamic_local_mem_size_aux2;
@ -1348,6 +1353,7 @@ typedef struct hc_device_param
  void   *kernel_params_tm[PARAMCNT];
  void   *kernel_params_memset[PARAMCNT];
  void   *kernel_params_atinit[PARAMCNT];
+  void   *kernel_params_utf8toutf16le[PARAMCNT];
  void   *kernel_params_decompress[PARAMCNT];

  u32     kernel_params_buf32[PARAMCNT];
@ -1371,6 +1377,9 @@ typedef struct hc_device_param
  u32     kernel_params_atinit_buf32[PARAMCNT];
  u64     kernel_params_atinit_buf64[PARAMCNT];

+  u32     kernel_params_utf8toutf16le_buf32[PARAMCNT];
+  u64     kernel_params_utf8toutf16le_buf64[PARAMCNT];
+
  u32     kernel_params_decompress_buf32[PARAMCNT];
  u64     kernel_params_decompress_buf64[PARAMCNT];

@ -1410,6 +1419,7 @@ typedef struct hc_device_param
  CUfunction        cuda_function_tm;
  CUfunction        cuda_function_memset;
  CUfunction        cuda_function_atinit;
+  CUfunction        cuda_function_utf8toutf16le;
  CUfunction        cuda_function_decompress;
  CUfunction        cuda_function_aux1;
  CUfunction        cuda_function_aux2;
@ -1494,6 +1504,7 @@ typedef struct hc_device_param
  cl_kernel         opencl_kernel_tm;
  cl_kernel         opencl_kernel_memset;
  cl_kernel         opencl_kernel_atinit;
+  cl_kernel         opencl_kernel_utf8toutf16le;
  cl_kernel         opencl_kernel_decompress;
  cl_kernel         opencl_kernel_aux1;
  cl_kernel         opencl_kernel_aux2;
--- a/src/backend.c
+++ b/src/backend.c
@ -3037,6 +3037,19 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
        if (run_kernel_amp (hashcat_ctx, device_param, pws_cnt) == -1) return -1;
      }

+      if (hashconfig->opts_type & OPTS_TYPE_POST_AMP_UTF16LE)
+      {
+        if (device_param->is_cuda == true)
+        {
+          if (run_cuda_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, pws_cnt) == -1) return -1;
+        }
+
+        if (device_param->is_opencl == true)
+        {
+          if (run_opencl_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, pws_cnt) == -1) return -1;
+        }
+      }
+
      if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_pos, pws_cnt, false, 0) == -1) return -1;

      if (hashconfig->opts_type & OPTS_TYPE_HOOK12)
@ -3414,6 +3427,26 @@ int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic
  return 0;
 }

+int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num)
+{
+  u64 num_elements = num;
+
+  device_param->kernel_params_utf8toutf16le[0]       = (void *) &buf;
+  device_param->kernel_params_utf8toutf16le_buf64[1] = num_elements;
+
+  const u64 kernel_threads = device_param->kernel_wgs_utf8toutf16le;
+
+  num_elements = CEILDIV (num_elements, kernel_threads);
+
+  CUfunction function = device_param->cuda_function_utf8toutf16le;
+
+  if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_utf8toutf16le, NULL) == -1) return -1;
+
+  if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
+
+  return 0;
+}
+
 int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size)
 {
  const u64 num16d = size / 16;
@ -3495,6 +3528,34 @@ int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *dev
  return 0;
 }

+int run_opencl_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num)
+{
+  u64 num_elements = num;
+
+  device_param->kernel_params_utf8toutf16le_buf64[1] = num_elements;
+
+  const u64 kernel_threads = device_param->kernel_wgs_utf8toutf16le;
+
+  num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+
+  cl_kernel kernel = device_param->opencl_kernel_utf8toutf16le;
+
+  const size_t global_work_size[3] = { num_elements,    1, 1 };
+  const size_t local_work_size[3]  = { kernel_threads,  1, 1 };
+
+  if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf) == -1) return -1;
+
+  if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_ulong), device_param->kernel_params_utf8toutf16le[1]) == -1) return -1;
+
+  if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1;
+
+  if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+
+  if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
+
+  return 0;
+}
+
 int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size)
 {
  const u64 num16d = size / 16;
@ -8681,6 +8742,18 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1;

        device_param->kernel_preferred_wgs_multiple_decompress = device_param->cuda_warp_size;
+
+        // GPU utf8 to utf16le conversion
+
+        if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_utf8toutf16le, device_param->cuda_module_shared, "gpu_utf8_to_utf16") == -1) return -1;
+
+        if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_utf8toutf16le, &device_param->kernel_wgs_utf8toutf16le) == -1) return -1;
+
+        if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_utf8toutf16le, &device_param->kernel_local_mem_size_utf8toutf16le) == -1) return -1;
+
+        if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_utf8toutf16le, &device_param->kernel_dynamic_local_mem_size_utf8toutf16le) == -1) return -1;
+
+        device_param->kernel_preferred_wgs_multiple_utf8toutf16le = device_param->cuda_warp_size;
      }

      if (device_param->is_opencl == true)
@ -8720,6 +8793,18 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
        if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1;

        if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress) == -1) return -1;
+
+        // GPU utf8 to utf16le conversion
+
+        if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_utf8_to_utf16", &device_param->opencl_kernel_utf8toutf16le) == -1) return -1;
+
+        if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_wgs_utf8toutf16le) == -1) return -1;
+
+        if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_local_mem_size_utf8toutf16le) == -1) return -1;
+
+        if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_dynamic_local_mem_size_utf8toutf16le) == -1) return -1;
+
+        if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_preferred_wgs_multiple_utf8toutf16le) == -1) return -1;
      }
    }

@ -9417,6 +9502,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    device_param->kernel_params_atinit[0] = NULL;
    device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1];

+    device_param->kernel_params_utf8toutf16le_buf64[1] = 0; // gid_max
+
+    device_param->kernel_params_utf8toutf16le[0] = NULL;
+    device_param->kernel_params_utf8toutf16le[1] = &device_param->kernel_params_utf8toutf16le_buf64[1];
+
    device_param->kernel_params_decompress_buf64[3] = 0; // gid_max

    if (device_param->is_cuda == true)
@ -10058,6 +10148,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem),   device_param->kernel_params_atinit[0]) == -1) return -1;
      if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]) == -1) return -1;

+      // GPU utf8 to utf16le init
+
+      if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_utf8toutf16le, 0, sizeof (cl_mem),   device_param->kernel_params_utf8toutf16le[0]) == -1) return -1;
+      if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_utf8toutf16le, 1, sizeof (cl_ulong), device_param->kernel_params_utf8toutf16le[1]) == -1) return -1;
+
      // GPU decompress

      if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem),   device_param->kernel_params_decompress[0]) == -1) return -1;
@ -11242,6 +11337,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
      device_param->cuda_function_amp         = NULL;
      device_param->cuda_function_memset      = NULL;
      device_param->cuda_function_atinit      = NULL;
+      device_param->cuda_function_utf8toutf16le = NULL;
      device_param->cuda_function_decompress  = NULL;
      device_param->cuda_function_aux1        = NULL;
      device_param->cuda_function_aux2        = NULL;
@ -11313,6 +11409,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
      if (device_param->opencl_kernel_amp)       hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_amp);
      if (device_param->opencl_kernel_memset)    hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_memset);
      if (device_param->opencl_kernel_atinit)    hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_atinit);
+      if (device_param->opencl_kernel_utf8toutf16le) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_utf8toutf16le);
      if (device_param->opencl_kernel_decompress)hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_decompress);
      if (device_param->opencl_kernel_aux1)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux1);
      if (device_param->opencl_kernel_aux2)      hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux2);
@ -11382,6 +11479,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
      device_param->opencl_kernel_amp          = NULL;
      device_param->opencl_kernel_memset       = NULL;
      device_param->opencl_kernel_atinit       = NULL;
+      device_param->opencl_kernel_utf8toutf16le = NULL;
      device_param->opencl_kernel_decompress   = NULL;
      device_param->opencl_kernel_aux1         = NULL;
      device_param->opencl_kernel_aux2         = NULL;
--- a/src/selftest.c
+++ b/src/selftest.c
@ -439,6 +439,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
  {
    // missing handling hooks

+    if (hashconfig->opts_type & OPTS_TYPE_POST_AMP_UTF16LE)
+    {
+      if (device_param->is_cuda == true)
+      {
+        if (run_cuda_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, 1) == -1) return -1;
+      }
+
+      if (device_param->is_opencl == true)
+      {
+        if (run_opencl_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, 1) == -1) return -1;
+      }
+    }
+
    if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, 1, false, 0) == -1) return -1;

    if (hashconfig->opts_type & OPTS_TYPE_HOOK12)