mirror of
https://github.com/hashcat/hashcat.git
synced 2024-11-22 08:08:10 +00:00
OpenCL Kernels: Moved "gpu_decompress", "gpu_memset" and "gpu_atinit" into new OpenCL/shared.cl in order to reduce compile time
This commit is contained in:
parent
08163501cf
commit
1fc37c25f9
@ -60899,145 +60899,3 @@ DECLSPEC void append_0x80_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
 * Expand one compressed password entry into a full pw_t.
 *
 * Reads the offset, word count and length stored in pws_idx[gid], clears all
 * 64 words of pw->i, copies `cnt` words from pws_comp starting at that offset
 * and finally stores the length in pw->pw_len.
 */
DECLSPEC void gpu_decompress_entry (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, pw_t *pw, const u64 gid)
{
  const u32 src_off   = pws_idx[gid].off;
  const u32 src_cnt   = pws_idx[gid].cnt;
  const u32 total_len = pws_idx[gid].len;

  // wipe the whole candidate buffer first so words past the copied range are zero

  #ifdef _unroll
  #pragma unroll
  #endif
  for (u32 k = 0; k < 64; k++)
  {
    pw->i[k] = 0;
  }

  // then lay the packed words over the start of the buffer

  u32 src = src_off;

  for (u32 dst = 0; dst < src_cnt; dst++)
  {
    pw->i[dst] = pws_comp[src];

    src++;
  }

  pw->pw_len = total_len;
}
|
||||
|
||||
/**
 * Kernel: decompress one password entry per work-item.
 *
 * Work-items whose global id is not below gid_max return without writing.
 * All others build the entry in a pw_t declared inside the kernel via
 * gpu_decompress_entry() and then store it to pws_buf[gid].
 */
KERNEL_FQ void gpu_decompress (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, GLOBAL_AS pw_t *pws_buf, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid < gid_max)
  {
    pw_t entry;

    gpu_decompress_entry (pws_idx, pws_comp, &entry, gid);

    pws_buf[gid] = entry;
  }
}
|
||||
|
||||
/**
 * Kernel: write a uint4 built from `value` to buf[gid].
 *
 * Work-items with gid >= gid_max do nothing. The way the uint4 is filled is
 * selected at compile time:
 *   IS_NATIVE - `value` is assigned to the uint4 directly
 *   IS_OPENCL - components s0..s3 are each set to `value`
 *   IS_CUDA   - components x, y, z, w are each set to `value`
 */
KERNEL_FQ void gpu_memset (GLOBAL_AS uint4 *buf, const u32 value, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid < gid_max)
  {
    uint4 fill;

    #if defined IS_NATIVE
    fill = value;
    #elif defined IS_OPENCL
    fill.s0 = value;
    fill.s1 = value;
    fill.s2 = value;
    fill.s3 = value;
    #elif defined IS_CUDA
    fill.x = value;
    fill.y = value;
    fill.z = value;
    fill.w = value;
    #endif

    buf[gid] = fill;
  }
}
|
||||
|
||||
/**
 * Kernel: write one gid-derived pw_t per work-item into buf.
 *
 * Word 0 is 0x5c5c5c5c XOR l32_from_64_S (gid), word 1 is 0x36363636 XOR
 * h32_from_64_S (gid), words 2..63 are zero and pw_len is fixed at 7.
 * Work-items with gid >= gid_max return without writing.
 */
KERNEL_FQ void gpu_atinit (GLOBAL_AS pw_t *buf, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  pw_t seed;

  seed.i[ 0] = 0x5c5c5c5c ^ l32_from_64_S (gid);
  seed.i[ 1] = 0x36363636 ^ h32_from_64_S (gid);

  // explicit per-word stores rather than a loop: noted by the author as faster

  seed.i[ 2] = 0; seed.i[ 3] = 0;
  seed.i[ 4] = 0; seed.i[ 5] = 0; seed.i[ 6] = 0; seed.i[ 7] = 0;
  seed.i[ 8] = 0; seed.i[ 9] = 0; seed.i[10] = 0; seed.i[11] = 0;
  seed.i[12] = 0; seed.i[13] = 0; seed.i[14] = 0; seed.i[15] = 0;
  seed.i[16] = 0; seed.i[17] = 0; seed.i[18] = 0; seed.i[19] = 0;
  seed.i[20] = 0; seed.i[21] = 0; seed.i[22] = 0; seed.i[23] = 0;
  seed.i[24] = 0; seed.i[25] = 0; seed.i[26] = 0; seed.i[27] = 0;
  seed.i[28] = 0; seed.i[29] = 0; seed.i[30] = 0; seed.i[31] = 0;
  seed.i[32] = 0; seed.i[33] = 0; seed.i[34] = 0; seed.i[35] = 0;
  seed.i[36] = 0; seed.i[37] = 0; seed.i[38] = 0; seed.i[39] = 0;
  seed.i[40] = 0; seed.i[41] = 0; seed.i[42] = 0; seed.i[43] = 0;
  seed.i[44] = 0; seed.i[45] = 0; seed.i[46] = 0; seed.i[47] = 0;
  seed.i[48] = 0; seed.i[49] = 0; seed.i[50] = 0; seed.i[51] = 0;
  seed.i[52] = 0; seed.i[53] = 0; seed.i[54] = 0; seed.i[55] = 0;
  seed.i[56] = 0; seed.i[57] = 0; seed.i[58] = 0; seed.i[59] = 0;
  seed.i[60] = 0; seed.i[61] = 0; seed.i[62] = 0; seed.i[63] = 0;

  // disabled alternative: pw_len = 1 + (low 32 bits of gid & 15)
  // fixed length instead; some algorithms are very sensitive to this (example: 12500)

  seed.pw_len = 7;

  buf[gid] = seed;
}
|
||||
|
@ -277,6 +277,5 @@ DECLSPEC void append_0x01_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const
|
||||
DECLSPEC void append_0x06_2x4_VV (u32x *w0, u32x *w1, const u32x offset);
|
||||
DECLSPEC void append_0x80_2x4_VV (u32x *w0, u32x *w1, const u32x offset);
|
||||
DECLSPEC void append_0x80_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32x offset);
|
||||
DECLSPEC void gpu_decompress_entry (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, pw_t *pw, const u64 gid);
|
||||
|
||||
#endif
|
||||
|
153
OpenCL/shared.cl
Normal file
153
OpenCL/shared.cl
Normal file
@ -0,0 +1,153 @@
|
||||
/**
|
||||
* Author......: See docs/credits.txt
|
||||
* License.....: MIT
|
||||
*/
|
||||
|
||||
#ifdef KERNEL_STATIC
|
||||
#include "inc_vendor.h"
|
||||
#include "inc_types.h"
|
||||
#include "inc_platform.cl"
|
||||
#include "inc_common.cl"
|
||||
#endif
|
||||
|
||||
/**
 * Expand one compressed password entry into a full pw_t.
 *
 * Reads the offset, word count and length stored in pws_idx[gid], clears all
 * 64 words of pw->i, copies `cnt` words from pws_comp starting at that offset
 * and finally stores the length in pw->pw_len.
 */
DECLSPEC void gpu_decompress_entry (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, pw_t *pw, const u64 gid)
{
  const u32 src_off   = pws_idx[gid].off;
  const u32 src_cnt   = pws_idx[gid].cnt;
  const u32 total_len = pws_idx[gid].len;

  // wipe the whole candidate buffer first so words past the copied range are zero

  #ifdef _unroll
  #pragma unroll
  #endif
  for (u32 k = 0; k < 64; k++)
  {
    pw->i[k] = 0;
  }

  // then lay the packed words over the start of the buffer

  u32 src = src_off;

  for (u32 dst = 0; dst < src_cnt; dst++)
  {
    pw->i[dst] = pws_comp[src];

    src++;
  }

  pw->pw_len = total_len;
}
|
||||
|
||||
/**
 * Kernel: decompress one password entry per work-item.
 *
 * Work-items whose global id is not below gid_max return without writing.
 * All others build the entry in a pw_t declared inside the kernel via
 * gpu_decompress_entry() and then store it to pws_buf[gid].
 */
KERNEL_FQ void gpu_decompress (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, GLOBAL_AS pw_t *pws_buf, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid < gid_max)
  {
    pw_t entry;

    gpu_decompress_entry (pws_idx, pws_comp, &entry, gid);

    pws_buf[gid] = entry;
  }
}
|
||||
|
||||
/**
 * Kernel: write a uint4 built from `value` to buf[gid].
 *
 * Work-items with gid >= gid_max do nothing. The way the uint4 is filled is
 * selected at compile time:
 *   IS_NATIVE - `value` is assigned to the uint4 directly
 *   IS_OPENCL - components s0..s3 are each set to `value`
 *   IS_CUDA   - components x, y, z, w are each set to `value`
 */
KERNEL_FQ void gpu_memset (GLOBAL_AS uint4 *buf, const u32 value, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid < gid_max)
  {
    uint4 fill;

    #if defined IS_NATIVE
    fill = value;
    #elif defined IS_OPENCL
    fill.s0 = value;
    fill.s1 = value;
    fill.s2 = value;
    fill.s3 = value;
    #elif defined IS_CUDA
    fill.x = value;
    fill.y = value;
    fill.z = value;
    fill.w = value;
    #endif

    buf[gid] = fill;
  }
}
|
||||
|
||||
/**
 * Kernel: write one gid-derived pw_t per work-item into buf.
 *
 * Word 0 is 0x5c5c5c5c XOR l32_from_64_S (gid), word 1 is 0x36363636 XOR
 * h32_from_64_S (gid), words 2..63 are zero and pw_len is fixed at 7.
 * Work-items with gid >= gid_max return without writing.
 */
KERNEL_FQ void gpu_atinit (GLOBAL_AS pw_t *buf, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  pw_t seed;

  seed.i[ 0] = 0x5c5c5c5c ^ l32_from_64_S (gid);
  seed.i[ 1] = 0x36363636 ^ h32_from_64_S (gid);

  // explicit per-word stores rather than a loop: noted by the author as faster

  seed.i[ 2] = 0; seed.i[ 3] = 0;
  seed.i[ 4] = 0; seed.i[ 5] = 0; seed.i[ 6] = 0; seed.i[ 7] = 0;
  seed.i[ 8] = 0; seed.i[ 9] = 0; seed.i[10] = 0; seed.i[11] = 0;
  seed.i[12] = 0; seed.i[13] = 0; seed.i[14] = 0; seed.i[15] = 0;
  seed.i[16] = 0; seed.i[17] = 0; seed.i[18] = 0; seed.i[19] = 0;
  seed.i[20] = 0; seed.i[21] = 0; seed.i[22] = 0; seed.i[23] = 0;
  seed.i[24] = 0; seed.i[25] = 0; seed.i[26] = 0; seed.i[27] = 0;
  seed.i[28] = 0; seed.i[29] = 0; seed.i[30] = 0; seed.i[31] = 0;
  seed.i[32] = 0; seed.i[33] = 0; seed.i[34] = 0; seed.i[35] = 0;
  seed.i[36] = 0; seed.i[37] = 0; seed.i[38] = 0; seed.i[39] = 0;
  seed.i[40] = 0; seed.i[41] = 0; seed.i[42] = 0; seed.i[43] = 0;
  seed.i[44] = 0; seed.i[45] = 0; seed.i[46] = 0; seed.i[47] = 0;
  seed.i[48] = 0; seed.i[49] = 0; seed.i[50] = 0; seed.i[51] = 0;
  seed.i[52] = 0; seed.i[53] = 0; seed.i[54] = 0; seed.i[55] = 0;
  seed.i[56] = 0; seed.i[57] = 0; seed.i[58] = 0; seed.i[59] = 0;
  seed.i[60] = 0; seed.i[61] = 0; seed.i[62] = 0; seed.i[63] = 0;

  // disabled alternative: pw_len = 1 + (low 32 bits of gid & 15)
  // fixed length instead; some algorithms are very sensitive to this (example: 12500)

  seed.pw_len = 7;

  buf[gid] = seed;
}
|
@ -151,6 +151,7 @@
|
||||
- Kernel Cache: Reactivate OpenCL runtime specific kernel caches
|
||||
- Kernel Compile: Removed -cl-std= from all kernel build options since we're compatible with all OpenCL versions
|
||||
- OpenCL Kernels: Fix OpenCL compiler warning on double precision constants
|
||||
- OpenCL Kernels: Moved "gpu_decompress", "gpu_memset" and "gpu_atinit" into shared.cl in order to reduce compile time
|
||||
- OpenCL Options: Removed --opencl-platforms filter in order to force backend device numbers to stay constant
|
||||
- Parsers: switched from strtok() to strtok_r() for thread safety
|
||||
- Requirements: Add new requirement for NVIDIA GPU: CUDA Toolkit (10.1 or later)
|
||||
|
@ -134,12 +134,14 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *de
|
||||
int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
|
||||
int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 pws_cnt);
|
||||
|
||||
void generate_source_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file);
|
||||
void generate_cached_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *profile_dir, const char *device_name_chksum, char *cached_file);
|
||||
void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file);
|
||||
void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum, char *cached_file);
|
||||
void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file);
|
||||
void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum, char *cached_file);
|
||||
void generate_source_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *shared_dir, char *source_file);
|
||||
void generate_cached_kernel_filename (const bool slow_candidates, const u32 attack_exec, const u32 attack_kern, const u32 kern_type, const u32 opti_type, char *profile_dir, const char *device_name_chksum, char *cached_file);
|
||||
void generate_source_kernel_shared_filename (char *shared_dir, char *source_file);
|
||||
void generate_cached_kernel_shared_filename (char *profile_dir, const char *device_name_chksum, char *cached_file);
|
||||
void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file);
|
||||
void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum, char *cached_file);
|
||||
void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file);
|
||||
void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum, char *cached_file);
|
||||
|
||||
int backend_ctx_init (hashcat_ctx_t *hashcat_ctx);
|
||||
void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx);
|
||||
|
@ -1322,6 +1322,7 @@ typedef struct hc_device_param
|
||||
CUevent cuda_event2;
|
||||
|
||||
CUmodule cuda_module;
|
||||
CUmodule cuda_module_shared;
|
||||
CUmodule cuda_module_mp;
|
||||
CUmodule cuda_module_amp;
|
||||
|
||||
@ -1403,6 +1404,7 @@ typedef struct hc_device_param
|
||||
cl_command_queue opencl_command_queue;
|
||||
|
||||
cl_program opencl_program;
|
||||
cl_program opencl_program_shared;
|
||||
cl_program opencl_program_mp;
|
||||
cl_program opencl_program_amp;
|
||||
|
||||
|
@ -631,6 +631,16 @@ void generate_cached_kernel_filename (const bool slow_candidates, const u32 atta
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Build the path of the shared kernel source file.
 *
 * Writes "<shared_dir>/OpenCL/shared.cl" into source_file. snprintf is given a
 * limit of 255 bytes (terminating NUL included), so a longer path is silently
 * truncated. The caller must provide a buffer of at least that size.
 */
void generate_source_kernel_shared_filename (char *shared_dir, char *source_file)
{
  const size_t source_file_max = 255;

  snprintf (source_file, source_file_max, "%s/OpenCL/shared.cl", shared_dir);
}
|
||||
|
||||
/**
 * Build the path of the cached (compiled) shared kernel.
 *
 * Writes "<profile_dir>/kernels/shared.<device_name_chksum_amp_mp>.kernel" into
 * cached_file. snprintf is given a limit of 255 bytes (terminating NUL
 * included), so a longer path is silently truncated. The caller must provide
 * a buffer of at least that size.
 */
void generate_cached_kernel_shared_filename (char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file)
{
  const size_t cached_file_max = 255;

  snprintf (cached_file, cached_file_max, "%s/kernels/shared.%s.kernel", profile_dir, device_name_chksum_amp_mp);
}
|
||||
|
||||
void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *shared_dir, char *source_file)
|
||||
{
|
||||
if ((opti_type & OPTI_TYPE_BRUTE_FORCE) && (opts_type & OPTS_TYPE_PT_GENERATE_BE))
|
||||
@ -7786,6 +7796,44 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
|
||||
|
||||
hcfree (build_options_module_buf);
|
||||
|
||||
/**
|
||||
* shared kernel with no hashconfig dependencies
|
||||
*/
|
||||
|
||||
{
|
||||
/**
|
||||
* kernel shared source filename
|
||||
*/
|
||||
|
||||
char source_file[256] = { 0 };
|
||||
|
||||
generate_source_kernel_shared_filename (folder_config->shared_dir, source_file);
|
||||
|
||||
if (hc_path_read (source_file) == false)
|
||||
{
|
||||
event_log_error (hashcat_ctx, "%s: %s", source_file, strerror (errno));
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* kernel shared cached filename
|
||||
*/
|
||||
|
||||
char cached_file[256] = { 0 };
|
||||
|
||||
generate_cached_kernel_shared_filename (folder_config->profile_dir, device_name_chksum_amp_mp, cached_file);
|
||||
|
||||
const bool rc_load_kernel = load_kernel (hashcat_ctx, device_param, "shared_kernel", source_file, cached_file, build_options_buf, cache_disable, &device_param->opencl_program_shared, &device_param->cuda_module_shared);
|
||||
|
||||
if (rc_load_kernel == false)
|
||||
{
|
||||
event_log_error (hashcat_ctx, "* Device #%u: Kernel %s build failed.", device_param->device_id + 1, source_file);
|
||||
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* word generator kernel
|
||||
*/
|
||||
@ -8708,7 +8756,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
|
||||
|
||||
// GPU memset
|
||||
|
||||
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_memset, device_param->cuda_module, "gpu_memset") == -1) return -1;
|
||||
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_memset, device_param->cuda_module_shared, "gpu_memset") == -1) return -1;
|
||||
|
||||
if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_memset, &device_param->kernel_wgs_memset) == -1) return -1;
|
||||
|
||||
@ -8722,7 +8770,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
|
||||
|
||||
// GPU autotune init
|
||||
|
||||
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_atinit, device_param->cuda_module, "gpu_atinit") == -1) return -1;
|
||||
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_atinit, device_param->cuda_module_shared, "gpu_atinit") == -1) return -1;
|
||||
|
||||
if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_atinit, &device_param->kernel_wgs_atinit) == -1) return -1;
|
||||
|
||||
@ -8735,7 +8783,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
|
||||
|
||||
// GPU decompress
|
||||
|
||||
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_decompress, device_param->cuda_module, "gpu_decompress") == -1) return -1;
|
||||
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_decompress, device_param->cuda_module_shared, "gpu_decompress") == -1) return -1;
|
||||
|
||||
if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_decompress, &device_param->kernel_wgs_decompress) == -1) return -1;
|
||||
|
||||
@ -9243,7 +9291,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
|
||||
|
||||
// GPU memset
|
||||
|
||||
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, "gpu_memset", &device_param->opencl_kernel_memset) == -1) return -1;
|
||||
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_memset", &device_param->opencl_kernel_memset) == -1) return -1;
|
||||
|
||||
if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_wgs_memset) == -1) return -1;
|
||||
|
||||
@ -9257,7 +9305,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
|
||||
|
||||
// GPU autotune init
|
||||
|
||||
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1;
|
||||
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1;
|
||||
|
||||
if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_atinit, &device_param->kernel_wgs_atinit) == -1) return -1;
|
||||
|
||||
@ -9270,7 +9318,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
|
||||
|
||||
// GPU decompress
|
||||
|
||||
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, "gpu_decompress", &device_param->opencl_kernel_decompress) == -1) return -1;
|
||||
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_decompress", &device_param->opencl_kernel_decompress) == -1) return -1;
|
||||
|
||||
if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_decompress, &device_param->kernel_wgs_decompress) == -1) return -1;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user