From 5c6501444ab844dd148d6a26da18073b4a420d1a Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Thu, 20 May 2021 14:34:24 +0200 Subject: [PATCH] Kernels: Add standalone true UTF8 to UTF16 converter kernel that runs after amplifier. Use OPTS_TYPE_POST_AMP_UTF16LE from plugin --- OpenCL/shared.cl | 21 ++++++++++ docs/changes.txt | 1 + include/backend.h | 2 + include/types.h | 11 ++++++ src/backend.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++ src/selftest.c | 13 +++++++ 6 files changed, 146 insertions(+) diff --git a/OpenCL/shared.cl b/OpenCL/shared.cl index 47c37c3c6..4feddab6f 100644 --- a/OpenCL/shared.cl +++ b/OpenCL/shared.cl @@ -212,3 +212,24 @@ KERNEL_FQ void gpu_atinit (GLOBAL_AS pw_t *buf, const u64 gid_max) buf[gid] = pw; } + +KERNEL_FQ void gpu_utf8_to_utf16 (GLOBAL_AS pw_t *pws_buf, const u64 gid_max) +{ + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + pw_t pw_in = pws_buf[gid]; + + pw_t pw_out; + + for (int i = 0; i < 64; i++) pw_out.i[i] = 0; + + hc_enc_t hc_enc; + + hc_enc_init (&hc_enc); + + pw_out.pw_len = hc_enc_next (&hc_enc, pw_in.i, pw_in.pw_len, 64, pw_out.i, 256); + + pws_buf[gid] = pw_out; +} diff --git a/docs/changes.txt b/docs/changes.txt index 4c58bab55..962152833 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -23,6 +23,7 @@ - Dependencies: Updated LZMA SDK from 19.00 to 21.02 alpha - Dependencies: Updated xxHash from 0.1.0 to v0.8.0 - Stable XXH3 +- Kernels: Add standalone true UTF8 to UTF16 converter kernel that runs after amplifier. Use OPTS_TYPE_POST_AMP_UTF16LE from plugin * changes v6.2.0 -> v6.2.1 diff --git a/include/backend.h b/include/backend.h index d363329f9..2e67e3e7e 100644 --- a/include/backend.h +++ b/include/backend.h @@ -122,10 +122,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 pws_cnt, const u8 chr); int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num); +int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num); int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size); int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size); int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num); +int run_opencl_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num); int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size); int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size); diff --git a/include/types.h b/include/types.h index ebb71f870..7b2cc1d46 100644 --- a/include/types.h +++ b/include/types.h @@ -442,6 +442,7 @@ typedef enum opts_type OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 51), // some algos use JiT in combinations with a salt or create too much startup time OPTS_TYPE_MP_MULTI_DISABLE = (1ULL << 52), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings OPTS_TYPE_NATIVE_THREADS = (1ULL << 53), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps) + OPTS_TYPE_POST_AMP_UTF16LE = (1ULL << 54), // run the utf8 to utf16le conversion kernel after they have been processed from amplifiers } opts_type_t; @@ -1116,6 +1117,7 @@ typedef struct hc_device_param u32 kernel_wgs_tm; u32 kernel_wgs_memset; u32 kernel_wgs_atinit; + u32 kernel_wgs_utf8toutf16le; u32 kernel_wgs_decompress; u32 kernel_wgs_aux1; u32 kernel_wgs_aux2; @@ -1140,6 +1142,7 @@ typedef struct hc_device_param u32 kernel_preferred_wgs_multiple_tm; u32 kernel_preferred_wgs_multiple_memset; u32 kernel_preferred_wgs_multiple_atinit; + u32 kernel_preferred_wgs_multiple_utf8toutf16le; u32 kernel_preferred_wgs_multiple_decompress; u32 kernel_preferred_wgs_multiple_aux1; u32 kernel_preferred_wgs_multiple_aux2; @@ -1164,6 +1167,7 @@ typedef struct hc_device_param u64 kernel_local_mem_size_tm; u64 kernel_local_mem_size_memset; u64 kernel_local_mem_size_atinit; + u64 kernel_local_mem_size_utf8toutf16le; u64 kernel_local_mem_size_decompress; u64 kernel_local_mem_size_aux1; u64 kernel_local_mem_size_aux2; @@ -1188,6 +1192,7 @@ typedef struct hc_device_param u64 kernel_dynamic_local_mem_size_tm; u64 kernel_dynamic_local_mem_size_memset; u64 kernel_dynamic_local_mem_size_atinit; + u64 kernel_dynamic_local_mem_size_utf8toutf16le; u64 kernel_dynamic_local_mem_size_decompress; u64 kernel_dynamic_local_mem_size_aux1; u64 kernel_dynamic_local_mem_size_aux2; @@ -1348,6 +1353,7 @@ typedef struct hc_device_param void *kernel_params_tm[PARAMCNT]; void *kernel_params_memset[PARAMCNT]; void *kernel_params_atinit[PARAMCNT]; + void *kernel_params_utf8toutf16le[PARAMCNT]; void *kernel_params_decompress[PARAMCNT]; u32 kernel_params_buf32[PARAMCNT]; @@ -1371,6 +1377,9 @@ typedef struct hc_device_param u32 kernel_params_atinit_buf32[PARAMCNT]; u64 kernel_params_atinit_buf64[PARAMCNT]; + u32 kernel_params_utf8toutf16le_buf32[PARAMCNT]; + u64 kernel_params_utf8toutf16le_buf64[PARAMCNT]; + u32 kernel_params_decompress_buf32[PARAMCNT]; u64 kernel_params_decompress_buf64[PARAMCNT]; @@ -1410,6 +1419,7 @@ typedef struct hc_device_param CUfunction cuda_function_tm; CUfunction cuda_function_memset; CUfunction cuda_function_atinit; + CUfunction cuda_function_utf8toutf16le; CUfunction cuda_function_decompress; CUfunction cuda_function_aux1; CUfunction cuda_function_aux2; @@ -1494,6 +1504,7 @@ typedef struct hc_device_param cl_kernel opencl_kernel_tm; cl_kernel opencl_kernel_memset; cl_kernel opencl_kernel_atinit; + cl_kernel opencl_kernel_utf8toutf16le; cl_kernel opencl_kernel_decompress; cl_kernel opencl_kernel_aux1; cl_kernel opencl_kernel_aux2; diff --git a/src/backend.c b/src/backend.c index de545fc27..1cfee2a1d 100644 --- a/src/backend.c +++ b/src/backend.c @@ -3037,6 +3037,19 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (run_kernel_amp (hashcat_ctx, device_param, pws_cnt) == -1) return -1; } + if (hashconfig->opts_type & OPTS_TYPE_POST_AMP_UTF16LE) + { + if (device_param->is_cuda == true) + { + if (run_cuda_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, pws_cnt) == -1) return -1; + } + + if (device_param->is_opencl == true) + { + if (run_opencl_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, pws_cnt) == -1) return -1; + } + } + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, pws_pos, pws_cnt, false, 0) == -1) return -1; if (hashconfig->opts_type & OPTS_TYPE_HOOK12) @@ -3414,6 +3427,26 @@ int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic return 0; } +int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num) +{ + u64 num_elements = num; + + device_param->kernel_params_utf8toutf16le[0] = (void *) &buf; + device_param->kernel_params_utf8toutf16le_buf64[1] = num_elements; + + const u64 kernel_threads = device_param->kernel_wgs_utf8toutf16le; + + num_elements = CEILDIV (num_elements, kernel_threads); + + CUfunction function = device_param->cuda_function_utf8toutf16le; + + if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_utf8toutf16le, NULL) == -1) return -1; + + if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + + return 0; +} + int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size) { const u64 num16d = size / 16; @@ -3495,6 +3528,34 @@ int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *dev return 0; } +int run_opencl_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num) +{ + u64 num_elements = num; + + device_param->kernel_params_utf8toutf16le_buf64[1] = num_elements; + + const u64 kernel_threads = device_param->kernel_wgs_utf8toutf16le; + + num_elements = round_up_multiple_64 (num_elements, kernel_threads); + + cl_kernel kernel = device_param->opencl_kernel_utf8toutf16le; + + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf) == -1) return -1; + + if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_ulong), device_param->kernel_params_utf8toutf16le[1]) == -1) return -1; + + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; + + if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + + return 0; +} + int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size) { const u64 num16d = size / 16; @@ -8681,6 +8742,18 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; device_param->kernel_preferred_wgs_multiple_decompress = device_param->cuda_warp_size; + + // GPU utf8 to utf16le conversion + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_utf8toutf16le, device_param->cuda_module_shared, "gpu_utf8_to_utf16") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_utf8toutf16le, &device_param->kernel_wgs_utf8toutf16le) == -1) return -1; + + if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_utf8toutf16le, &device_param->kernel_local_mem_size_utf8toutf16le) == -1) return -1; + + if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_utf8toutf16le, &device_param->kernel_dynamic_local_mem_size_utf8toutf16le) == -1) return -1; + + device_param->kernel_preferred_wgs_multiple_utf8toutf16le = device_param->cuda_warp_size; } if (device_param->is_opencl == true) @@ -8720,6 +8793,18 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_dynamic_local_mem_size_decompress) == -1) return -1; if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_preferred_wgs_multiple_decompress) == -1) return -1; + + // GPU utf8 to utf16le conversion + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_utf8_to_utf16", &device_param->opencl_kernel_utf8toutf16le) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_wgs_utf8toutf16le) == -1) return -1; + + if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_local_mem_size_utf8toutf16le) == -1) return -1; + + if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_dynamic_local_mem_size_utf8toutf16le) == -1) return -1; + + if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_utf8toutf16le, &device_param->kernel_preferred_wgs_multiple_utf8toutf16le) == -1) return -1; } } @@ -9417,6 +9502,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_atinit[0] = NULL; device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1]; + device_param->kernel_params_utf8toutf16le_buf64[1] = 0; // gid_max + + device_param->kernel_params_utf8toutf16le[0] = NULL; + device_param->kernel_params_utf8toutf16le[1] = &device_param->kernel_params_utf8toutf16le_buf64[1]; + device_param->kernel_params_decompress_buf64[3] = 0; // gid_max if (device_param->is_cuda == true) @@ -10058,6 +10148,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]) == -1) return -1; if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]) == -1) return -1; + // GPU utf8 to utf16le init + + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_utf8toutf16le, 0, sizeof (cl_mem), device_param->kernel_params_utf8toutf16le[0]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_utf8toutf16le, 1, sizeof (cl_ulong), device_param->kernel_params_utf8toutf16le[1]) == -1) return -1; + // GPU decompress if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]) == -1) return -1; @@ -11242,6 +11337,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->cuda_function_amp = NULL; device_param->cuda_function_memset = NULL; device_param->cuda_function_atinit = NULL; + device_param->cuda_function_utf8toutf16le = NULL; device_param->cuda_function_decompress = NULL; device_param->cuda_function_aux1 = NULL; device_param->cuda_function_aux2 = NULL; @@ -11313,6 +11409,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->opencl_kernel_amp) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_amp); if (device_param->opencl_kernel_memset) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_memset); if (device_param->opencl_kernel_atinit) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_atinit); + if (device_param->opencl_kernel_utf8toutf16le) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_utf8toutf16le); if (device_param->opencl_kernel_decompress)hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_decompress); if (device_param->opencl_kernel_aux1) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux1); if (device_param->opencl_kernel_aux2) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_aux2); @@ -11382,6 +11479,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->opencl_kernel_amp = NULL; device_param->opencl_kernel_memset = NULL; device_param->opencl_kernel_atinit = NULL; + device_param->opencl_kernel_utf8toutf16le = NULL; device_param->opencl_kernel_decompress = NULL; device_param->opencl_kernel_aux1 = NULL; device_param->opencl_kernel_aux2 = NULL; diff --git a/src/selftest.c b/src/selftest.c index 4b752338d..088f6f225 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -439,6 +439,19 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param { // missing handling hooks + if (hashconfig->opts_type & OPTS_TYPE_POST_AMP_UTF16LE) + { + if (device_param->is_cuda == true) + { + if (run_cuda_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, 1) == -1) return -1; + } + + if (device_param->is_opencl == true) + { + if (run_opencl_kernel_utf8toutf16le (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, 1) == -1) return -1; + } + } + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, 1, false, 0) == -1) return -1; if (hashconfig->opts_type & OPTS_TYPE_HOOK12)