mirror of
https://github.com/hashcat/hashcat.git
synced 2024-11-22 16:18:09 +00:00
OpenCL Kernels: Refactored kernel thread management from native to maximum per kernel
This commit is contained in:
parent
d38608b9bc
commit
c4f30220a0
@ -63,8 +63,7 @@
|
||||
- OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
|
||||
- OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
|
||||
- OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
|
||||
- OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count
|
||||
- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
|
||||
- OpenCL Kernels: Refactored kernel thread management from native to maximum per kernel
|
||||
- OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
|
||||
- OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
|
||||
- OpenCL Runtime: Updated rocm detection
|
||||
|
@ -1975,7 +1975,6 @@ int ascii_digest (hashcat_ctx_t *hashcat_ctx, char *out_buf, const size_t out_le
|
||||
int hashconfig_init (hashcat_ctx_t *hashcat_ctx);
|
||||
void hashconfig_destroy (hashcat_ctx_t *hashcat_ctx);
|
||||
u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx);
|
||||
u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param);
|
||||
u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param);
|
||||
u32 hashconfig_get_kernel_loops (hashcat_ctx_t *hashcat_ctx);
|
||||
int hashconfig_general_defaults (hashcat_ctx_t *hashcat_ctx);
|
||||
|
@ -141,16 +141,6 @@ typedef enum amplifier_count
|
||||
|
||||
} amplifier_count_t;
|
||||
|
||||
typedef enum native_threads
|
||||
{
|
||||
KERNEL_THREADS_NATIVE_CPU = 1,
|
||||
KERNEL_THREADS_NATIVE_GPU = 8, // ex: intel integrated
|
||||
KERNEL_THREADS_NATIVE_GPU_NV = 32, // optimized NV size: warps
|
||||
KERNEL_THREADS_NATIVE_GPU_AMD = 64, // optimized AMD size: wavefronts
|
||||
KERNEL_THREADS_NATIVE_OTHER = 8, // ex: intel MIC
|
||||
|
||||
} native_threads_t;
|
||||
|
||||
typedef enum vendor_id
|
||||
{
|
||||
VENDOR_ID_AMD = (1 << 0),
|
||||
@ -932,24 +922,22 @@ typedef struct hc_device_param
|
||||
|
||||
u32 vector_width;
|
||||
|
||||
u32 kernel_threads_by_user;
|
||||
|
||||
u32 kernel_threads_by_wgs_kernel1;
|
||||
u32 kernel_threads_by_wgs_kernel12;
|
||||
u32 kernel_threads_by_wgs_kernel2;
|
||||
u32 kernel_threads_by_wgs_kernel23;
|
||||
u32 kernel_threads_by_wgs_kernel3;
|
||||
u32 kernel_threads_by_wgs_kernel4;
|
||||
u32 kernel_threads_by_wgs_kernel_init2;
|
||||
u32 kernel_threads_by_wgs_kernel_loop2;
|
||||
u32 kernel_threads_by_wgs_kernel_mp;
|
||||
u32 kernel_threads_by_wgs_kernel_mp_l;
|
||||
u32 kernel_threads_by_wgs_kernel_mp_r;
|
||||
u32 kernel_threads_by_wgs_kernel_amp;
|
||||
u32 kernel_threads_by_wgs_kernel_tm;
|
||||
u32 kernel_threads_by_wgs_kernel_memset;
|
||||
u32 kernel_threads_by_wgs_kernel_atinit;
|
||||
u32 kernel_threads_by_wgs_kernel_decompress;
|
||||
u32 kernel_wgs1;
|
||||
u32 kernel_wgs12;
|
||||
u32 kernel_wgs2;
|
||||
u32 kernel_wgs23;
|
||||
u32 kernel_wgs3;
|
||||
u32 kernel_wgs4;
|
||||
u32 kernel_wgs_init2;
|
||||
u32 kernel_wgs_loop2;
|
||||
u32 kernel_wgs_mp;
|
||||
u32 kernel_wgs_mp_l;
|
||||
u32 kernel_wgs_mp_r;
|
||||
u32 kernel_wgs_amp;
|
||||
u32 kernel_wgs_tm;
|
||||
u32 kernel_wgs_memset;
|
||||
u32 kernel_wgs_atinit;
|
||||
u32 kernel_wgs_decompress;
|
||||
|
||||
u32 kernel_preferred_wgs_multiple1;
|
||||
u32 kernel_preferred_wgs_multiple12;
|
||||
@ -985,6 +973,8 @@ typedef struct hc_device_param
|
||||
u64 kernel_local_mem_size_atinit;
|
||||
u64 kernel_local_mem_size_decompress;
|
||||
|
||||
u32 kernel_threads;
|
||||
|
||||
u32 kernel_accel;
|
||||
u32 kernel_accel_prev;
|
||||
u32 kernel_accel_min;
|
||||
|
@ -19,25 +19,21 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
|
||||
device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set
|
||||
device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
|
||||
|
||||
const u32 kernel_power_try = device_param->hardware_power * kernel_accel;
|
||||
|
||||
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
|
||||
{
|
||||
if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
|
||||
{
|
||||
const u32 kernel_power_try = device_param->device_processors * device_param->kernel_threads_by_wgs_kernel1 * kernel_accel;
|
||||
|
||||
run_kernel (hashcat_ctx, device_param, KERN_RUN_1, kernel_power_try, true, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
const u32 kernel_power_try = device_param->device_processors * device_param->kernel_threads_by_wgs_kernel4 * kernel_accel;
|
||||
|
||||
run_kernel (hashcat_ctx, device_param, KERN_RUN_4, kernel_power_try, true, 0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const u32 kernel_power_try = device_param->device_processors * device_param->kernel_threads_by_wgs_kernel2 * kernel_accel;
|
||||
|
||||
run_kernel (hashcat_ctx, device_param, KERN_RUN_2, kernel_power_try, true, 0);
|
||||
}
|
||||
|
||||
@ -89,7 +85,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
|
||||
device_param->kernel_accel = kernel_accel;
|
||||
device_param->kernel_loops = kernel_loops;
|
||||
|
||||
const u32 kernel_power = device_param->device_processors * device_param->kernel_threads_by_user * device_param->kernel_accel;
|
||||
const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel;
|
||||
|
||||
device_param->kernel_power = kernel_power;
|
||||
|
||||
@ -99,7 +95,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
|
||||
// from here it's clear we are allowed to autotune
|
||||
// so let's init some fake words
|
||||
|
||||
const u32 kernel_power_max = device_param->device_processors * device_param->kernel_threads_by_user * kernel_accel_max;
|
||||
const u32 kernel_power_max = device_param->hardware_power * kernel_accel_max;
|
||||
|
||||
int CL_rc;
|
||||
|
||||
@ -258,7 +254,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
|
||||
device_param->kernel_accel = kernel_accel;
|
||||
device_param->kernel_loops = kernel_loops;
|
||||
|
||||
const u32 kernel_power = device_param->device_processors * device_param->kernel_threads_by_user * device_param->kernel_accel;
|
||||
const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel;
|
||||
|
||||
device_param->kernel_power = kernel_power;
|
||||
|
||||
|
@ -25970,68 +25970,10 @@ u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx)
|
||||
return kernel_threads;
|
||||
}
|
||||
|
||||
u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
|
||||
{
|
||||
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
|
||||
|
||||
u32 kernel_threads = 0;
|
||||
|
||||
// sometimes there's a high kernel requirement for local memory (which is multiplied with threads)
|
||||
|
||||
u32 local_mem_per_thread = 0;
|
||||
|
||||
// basically the sum of all .local space of the _loop kernel
|
||||
// see .ptx
|
||||
|
||||
if (hashconfig->hash_mode == 1800) local_mem_per_thread = 1024;
|
||||
if (hashconfig->hash_mode == 12500) local_mem_per_thread = 3296;
|
||||
if (hashconfig->hash_mode == 13400) local_mem_per_thread = 5360;
|
||||
|
||||
if (local_mem_per_thread)
|
||||
{
|
||||
const u32 device_local_mem_size = (const u32) device_param->device_local_mem_size;
|
||||
|
||||
kernel_threads = device_local_mem_size / local_mem_per_thread;
|
||||
|
||||
// there can be some very unaligned results from this, therefore round it down to next power of two
|
||||
|
||||
kernel_threads = power_of_two_floor_32 (kernel_threads);
|
||||
}
|
||||
|
||||
// make sure to not underpower
|
||||
|
||||
if (kernel_threads)
|
||||
{
|
||||
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
|
||||
}
|
||||
else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
|
||||
{
|
||||
if (device_param->device_vendor_id == VENDOR_ID_NV)
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_NV);
|
||||
}
|
||||
else if (device_param->device_vendor_id == VENDOR_ID_AMD)
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_AMD);
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_OTHER);
|
||||
}
|
||||
}
|
||||
|
||||
return kernel_threads;
|
||||
}
|
||||
|
||||
u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
|
||||
{
|
||||
const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
|
||||
|
||||
// a kernel can force a fixed value
|
||||
|
||||
const u32 forced_kernel_threads = hashconfig_forced_kernel_threads (hashcat_ctx);
|
||||
@ -26042,19 +25984,32 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
|
||||
|
||||
u32 kernel_threads = (u32) device_param->device_maxworkgroup_size;
|
||||
|
||||
// for CPU we use a special path
|
||||
|
||||
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
|
||||
{
|
||||
kernel_threads = MIN (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
|
||||
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
|
||||
{
|
||||
if (device_param->kernel_preferred_wgs_multiple1) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple1);
|
||||
if (device_param->kernel_preferred_wgs_multiple2) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple2);
|
||||
if (device_param->kernel_preferred_wgs_multiple3) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple3);
|
||||
if (device_param->kernel_preferred_wgs_multiple4) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple4);
|
||||
if (device_param->kernel_preferred_wgs_multiple_tm) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple_tm);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (device_param->kernel_preferred_wgs_multiple1) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple1);
|
||||
if (device_param->kernel_preferred_wgs_multiple2) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple2);
|
||||
if (device_param->kernel_preferred_wgs_multiple3) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple3);
|
||||
if (device_param->kernel_preferred_wgs_multiple12) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple12);
|
||||
if (device_param->kernel_preferred_wgs_multiple23) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple23);
|
||||
if (device_param->kernel_preferred_wgs_multiple_init2) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple_init2);
|
||||
if (device_param->kernel_preferred_wgs_multiple_loop2) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple_loop2);
|
||||
}
|
||||
}
|
||||
|
||||
// or if it requires for example a lot of local memory
|
||||
|
||||
const u32 limited_kernel_threads = hashconfig_limited_kernel_threads (hashcat_ctx, device_param);
|
||||
|
||||
if (limited_kernel_threads)
|
||||
{
|
||||
kernel_threads = MIN (kernel_threads, limited_kernel_threads);
|
||||
}
|
||||
// we'll return a number power of two, makes future processing much more easy
|
||||
// kernel_threads = power_of_two_floor_32 (kernel_threads);
|
||||
|
||||
return kernel_threads;
|
||||
}
|
||||
|
671
src/opencl.c
671
src/opencl.c
File diff suppressed because it is too large
Load Diff
@ -35,35 +35,6 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
|
||||
device_param->kernel_params_buf32[31] = 1;
|
||||
device_param->kernel_params_buf32[32] = 0;
|
||||
|
||||
const u32 kernel_threads_by_wgs_kernel1_sav = device_param->kernel_threads_by_wgs_kernel1;
|
||||
const u32 kernel_threads_by_wgs_kernel12_sav = device_param->kernel_threads_by_wgs_kernel12;
|
||||
const u32 kernel_threads_by_wgs_kernel2_sav = device_param->kernel_threads_by_wgs_kernel2;
|
||||
const u32 kernel_threads_by_wgs_kernel23_sav = device_param->kernel_threads_by_wgs_kernel23;
|
||||
const u32 kernel_threads_by_wgs_kernel3_sav = device_param->kernel_threads_by_wgs_kernel3;
|
||||
const u32 kernel_threads_by_wgs_kernel4_sav = device_param->kernel_threads_by_wgs_kernel4;
|
||||
const u32 kernel_threads_by_wgs_kernel_init2_sav = device_param->kernel_threads_by_wgs_kernel_init2;
|
||||
const u32 kernel_threads_by_wgs_kernel_loop2_sav = device_param->kernel_threads_by_wgs_kernel_loop2;
|
||||
|
||||
if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options_extra->attack_kern == ATTACK_KERN_BF))
|
||||
{
|
||||
// do nothing
|
||||
}
|
||||
else
|
||||
{
|
||||
// there's a few algorithm that force a fixed thread size but are not listed in hashconfig_forced_kernel_threads()
|
||||
// because it's not a global fixed thread, just a single one on a single kernel
|
||||
// if it wants to run at 8 and we set it to 1 it will return CL_INVALID_WORK_GROUP_SIZE
|
||||
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel1) device_param->kernel_threads_by_wgs_kernel1 = 1;
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel12) device_param->kernel_threads_by_wgs_kernel12 = 1;
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel2) device_param->kernel_threads_by_wgs_kernel2 = 1;
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel23) device_param->kernel_threads_by_wgs_kernel23 = 1;
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel3) device_param->kernel_threads_by_wgs_kernel3 = 1;
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel4) device_param->kernel_threads_by_wgs_kernel4 = 1;
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel_init2) device_param->kernel_threads_by_wgs_kernel_init2 = 1;
|
||||
if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel_loop2) device_param->kernel_threads_by_wgs_kernel_loop2 = 1;
|
||||
}
|
||||
|
||||
// password : move the known password into a fake buffer
|
||||
|
||||
u32 highest_pw_len = 0;
|
||||
@ -458,15 +429,6 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
|
||||
|
||||
// finish : cleanup and restore
|
||||
|
||||
device_param->kernel_threads_by_wgs_kernel1 = kernel_threads_by_wgs_kernel1_sav;
|
||||
device_param->kernel_threads_by_wgs_kernel12 = kernel_threads_by_wgs_kernel12_sav;
|
||||
device_param->kernel_threads_by_wgs_kernel2 = kernel_threads_by_wgs_kernel2_sav;
|
||||
device_param->kernel_threads_by_wgs_kernel23 = kernel_threads_by_wgs_kernel23_sav;
|
||||
device_param->kernel_threads_by_wgs_kernel3 = kernel_threads_by_wgs_kernel3_sav;
|
||||
device_param->kernel_threads_by_wgs_kernel4 = kernel_threads_by_wgs_kernel4_sav;
|
||||
device_param->kernel_threads_by_wgs_kernel_init2 = kernel_threads_by_wgs_kernel_init2_sav;
|
||||
device_param->kernel_threads_by_wgs_kernel_loop2 = kernel_threads_by_wgs_kernel_loop2_sav;
|
||||
|
||||
device_param->kernel_params_buf32[27] = 0;
|
||||
device_param->kernel_params_buf32[28] = 0;
|
||||
device_param->kernel_params_buf32[29] = 0;
|
||||
|
@ -1389,7 +1389,7 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d
|
||||
|
||||
if (device_param->skipped == false)
|
||||
{
|
||||
const u32 speed_pos = device_param->speed_pos;
|
||||
const u32 speed_pos = MAX (device_param->speed_pos, 1);
|
||||
|
||||
for (u32 i = 0; i < speed_pos; i++)
|
||||
{
|
||||
@ -1817,7 +1817,7 @@ int status_get_kernel_threads_dev (const hashcat_ctx_t *hashcat_ctx, const int d
|
||||
|
||||
if (device_param->skipped == true) return 0;
|
||||
|
||||
return device_param->kernel_threads_by_user;
|
||||
return device_param->kernel_threads;
|
||||
}
|
||||
|
||||
int status_get_vector_width_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)
|
||||
|
Loading…
Reference in New Issue
Block a user