mirror of https://github.com/hashcat/hashcat.git

OpenCL Kernels: Refactored kernel thread management from native to maximum per kernel

jsteube 2018-02-11 21:29:57 +01:00
parent d38608b9bc
commit c4f30220a0
8 changed files with 403 additions and 467 deletions

View File

@@ -63,8 +63,7 @@
 - OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
 - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
 - OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
-- OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count
-- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
+- OpenCL Kernels: Refactored kernel thread management from native to maximum per kernel
 - OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
 - OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier
 - OpenCL Runtime: Updated rocm detection
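The "native to maximum" switch means the thread count is no longer picked from hard-coded per-vendor constants but derived from what the device and kernel actually report at runtime. A minimal standalone OpenCL sketch of the two queries this relies on (illustrative only, not hashcat code; the device and kernel handles are assumed valid):

#include <CL/cl.h>
#include <stdio.h>

// Query the two values that replace the old per-vendor native constants:
// the device-wide maximum work-group size (the new upper bound) and the
// per-kernel preferred work-group size multiple (warp/wavefront size).
static void query_thread_limits (cl_device_id device, cl_kernel kernel)
{
  size_t max_wgs = 0;

  clGetDeviceInfo (device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                   sizeof (max_wgs), &max_wgs, NULL);

  size_t preferred_multiple = 0;

  clGetKernelWorkGroupInfo (kernel, device,
                            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                            sizeof (preferred_multiple), &preferred_multiple, NULL);

  printf ("max work-group size: %zu, preferred multiple: %zu\n",
          max_wgs, preferred_multiple);
}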

View File

@@ -1975,7 +1975,6 @@ int ascii_digest (hashcat_ctx_t *hashcat_ctx, char *out_buf, const size_t out_le
 int hashconfig_init (hashcat_ctx_t *hashcat_ctx);
 void hashconfig_destroy (hashcat_ctx_t *hashcat_ctx);
 u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx);
-u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param);
 u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param);
 u32 hashconfig_get_kernel_loops (hashcat_ctx_t *hashcat_ctx);
 int hashconfig_general_defaults (hashcat_ctx_t *hashcat_ctx);

View File

@@ -141,16 +141,6 @@ typedef enum amplifier_count
 } amplifier_count_t;
-
-typedef enum native_threads
-{
-  KERNEL_THREADS_NATIVE_CPU     = 1,
-  KERNEL_THREADS_NATIVE_GPU     = 8,  // ex: intel integrated
-  KERNEL_THREADS_NATIVE_GPU_NV  = 32, // optimized NV size: warps
-  KERNEL_THREADS_NATIVE_GPU_AMD = 64, // optimized AMD size: wavefronts
-  KERNEL_THREADS_NATIVE_OTHER   = 8,  // ex: intel MIC
-} native_threads_t;
 
 typedef enum vendor_id
 {
   VENDOR_ID_AMD = (1 << 0),
@@ -932,24 +922,22 @@ typedef struct hc_device_param
   u32 vector_width;
 
-  u32 kernel_threads_by_user;
-  u32 kernel_threads_by_wgs_kernel1;
-  u32 kernel_threads_by_wgs_kernel12;
-  u32 kernel_threads_by_wgs_kernel2;
-  u32 kernel_threads_by_wgs_kernel23;
-  u32 kernel_threads_by_wgs_kernel3;
-  u32 kernel_threads_by_wgs_kernel4;
-  u32 kernel_threads_by_wgs_kernel_init2;
-  u32 kernel_threads_by_wgs_kernel_loop2;
-  u32 kernel_threads_by_wgs_kernel_mp;
-  u32 kernel_threads_by_wgs_kernel_mp_l;
-  u32 kernel_threads_by_wgs_kernel_mp_r;
-  u32 kernel_threads_by_wgs_kernel_amp;
-  u32 kernel_threads_by_wgs_kernel_tm;
-  u32 kernel_threads_by_wgs_kernel_memset;
-  u32 kernel_threads_by_wgs_kernel_atinit;
-  u32 kernel_threads_by_wgs_kernel_decompress;
+  u32 kernel_wgs1;
+  u32 kernel_wgs12;
+  u32 kernel_wgs2;
+  u32 kernel_wgs23;
+  u32 kernel_wgs3;
+  u32 kernel_wgs4;
+  u32 kernel_wgs_init2;
+  u32 kernel_wgs_loop2;
+  u32 kernel_wgs_mp;
+  u32 kernel_wgs_mp_l;
+  u32 kernel_wgs_mp_r;
+  u32 kernel_wgs_amp;
+  u32 kernel_wgs_tm;
+  u32 kernel_wgs_memset;
+  u32 kernel_wgs_atinit;
+  u32 kernel_wgs_decompress;
 
   u32 kernel_preferred_wgs_multiple1;
   u32 kernel_preferred_wgs_multiple12;
@@ -985,6 +973,8 @@ typedef struct hc_device_param
   u64 kernel_local_mem_size_atinit;
   u64 kernel_local_mem_size_decompress;
 
+  u32 kernel_threads;
+
   u32 kernel_accel;
   u32 kernel_accel_prev;
   u32 kernel_accel_min;

View File

@@ -19,25 +19,21 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
   device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set
   device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
 
+  const u32 kernel_power_try = device_param->hardware_power * kernel_accel;
+
   if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
   {
     if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
     {
-      const u32 kernel_power_try = device_param->device_processors * device_param->kernel_threads_by_wgs_kernel1 * kernel_accel;
-
       run_kernel (hashcat_ctx, device_param, KERN_RUN_1, kernel_power_try, true, 0);
     }
     else
     {
-      const u32 kernel_power_try = device_param->device_processors * device_param->kernel_threads_by_wgs_kernel4 * kernel_accel;
-
       run_kernel (hashcat_ctx, device_param, KERN_RUN_4, kernel_power_try, true, 0);
     }
   }
   else
   {
-    const u32 kernel_power_try = device_param->device_processors * device_param->kernel_threads_by_wgs_kernel2 * kernel_accel;
-
     run_kernel (hashcat_ctx, device_param, KERN_RUN_2, kernel_power_try, true, 0);
   }
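The three per-branch kernel_power_try computations collapse into one because the new hardware_power field folds device_processors and the unified thread count into a single per-device value. A sketch of the relationship implied by this hunk (names and types assumed from the diff, not copied from hashcat):

#include <stdint.h>

typedef uint32_t u32;

// hardware_power is computed once per device; each autotune probe then
// scales it by the candidate kernel_accel. The old code recomputed the
// processors * threads product per kernel, using that kernel's own
// work-group size.
static u32 kernel_power_try (u32 device_processors, u32 kernel_threads, u32 kernel_accel)
{
  const u32 hardware_power = device_processors * kernel_threads; // per device, fixed

  return hardware_power * kernel_accel; // per tuning step
}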
@@ -89,7 +85,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   device_param->kernel_accel = kernel_accel;
   device_param->kernel_loops = kernel_loops;
 
-  const u32 kernel_power = device_param->device_processors * device_param->kernel_threads_by_user * device_param->kernel_accel;
+  const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel;
 
   device_param->kernel_power = kernel_power;
@@ -99,7 +95,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   // from here it's clear we are allowed to autotune
   // so let's init some fake words
 
-  const u32 kernel_power_max = device_param->device_processors * device_param->kernel_threads_by_user * kernel_accel_max;
+  const u32 kernel_power_max = device_param->hardware_power * kernel_accel_max;
 
   int CL_rc;
@@ -258,7 +254,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   device_param->kernel_accel = kernel_accel;
   device_param->kernel_loops = kernel_loops;
 
-  const u32 kernel_power = device_param->device_processors * device_param->kernel_threads_by_user * device_param->kernel_accel;
+  const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel;
 
   device_param->kernel_power = kernel_power;

View File

@@ -25970,68 +25970,10 @@ u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx)
   return kernel_threads;
 }
-
-u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
-{
-  hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
-
-  u32 kernel_threads = 0;
-
-  // sometimes there's a high kernel requirement for local memory (which is multiplied with threads)
-
-  u32 local_mem_per_thread = 0;
-
-  // basically the sum of all .local space of the _loop kernel
-  // see .ptx
-
-  if (hashconfig->hash_mode == 1800)  local_mem_per_thread = 1024;
-  if (hashconfig->hash_mode == 12500) local_mem_per_thread = 3296;
-  if (hashconfig->hash_mode == 13400) local_mem_per_thread = 5360;
-
-  if (local_mem_per_thread)
-  {
-    const u32 device_local_mem_size = (const u32) device_param->device_local_mem_size;
-
-    kernel_threads = device_local_mem_size / local_mem_per_thread;
-
-    // there can be some very unaligned results from this, therefore round it down to next power of two
-
-    kernel_threads = power_of_two_floor_32 (kernel_threads);
-  }
-
-  // make sure to not underpower
-
-  if (kernel_threads)
-  {
-    if (device_param->device_type & CL_DEVICE_TYPE_CPU)
-    {
-      kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
-    }
-    else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
-    {
-      if (device_param->device_vendor_id == VENDOR_ID_NV)
-      {
-        kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_NV);
-      }
-      else if (device_param->device_vendor_id == VENDOR_ID_AMD)
-      {
-        kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_AMD);
-      }
-      else
-      {
-        kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU);
-      }
-    }
-    else
-    {
-      kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_OTHER);
-    }
-  }
-
-  return kernel_threads;
-}
 
 u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
 {
   const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
 
   // a kernel can force a fixed value
 
   const u32 forced_kernel_threads = hashconfig_forced_kernel_threads (hashcat_ctx);
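The deleted function rounded its local-memory-derived thread count down to a power of two via power_of_two_floor_32. For reference, a bit-smearing sketch of that rounding (assumed to match the helper's intent; not hashcat's implementation):

#include <stdint.h>

typedef uint32_t u32;

// Round down to the nearest power of two, e.g. 48 -> 32, 64 -> 64, 0 -> 0.
// Smearing propagates the highest set bit into every lower position;
// subtracting the smeared value shifted right by one leaves only that bit.
static u32 pow2_floor_32 (u32 v)
{
  v |= v >>  1;
  v |= v >>  2;
  v |= v >>  4;
  v |= v >>  8;
  v |= v >> 16;

  return v - (v >> 1);
}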
@@ -26042,19 +25984,32 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
   u32 kernel_threads = (u32) device_param->device_maxworkgroup_size;
 
   // for CPU we use a special path
 
   if (device_param->device_type & CL_DEVICE_TYPE_CPU)
   {
-    kernel_threads = MIN (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
+    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
+    {
+      if (device_param->kernel_preferred_wgs_multiple1)  kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple1);
+      if (device_param->kernel_preferred_wgs_multiple2)  kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple2);
+      if (device_param->kernel_preferred_wgs_multiple3)  kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple3);
+      if (device_param->kernel_preferred_wgs_multiple4)  kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple4);
+      if (device_param->kernel_preferred_wgs_multiple_tm) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple_tm);
+    }
+    else
+    {
+      if (device_param->kernel_preferred_wgs_multiple1)     kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple1);
+      if (device_param->kernel_preferred_wgs_multiple2)     kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple2);
+      if (device_param->kernel_preferred_wgs_multiple3)     kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple3);
+      if (device_param->kernel_preferred_wgs_multiple12)    kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple12);
+      if (device_param->kernel_preferred_wgs_multiple23)    kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple23);
+      if (device_param->kernel_preferred_wgs_multiple_init2) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple_init2);
+      if (device_param->kernel_preferred_wgs_multiple_loop2) kernel_threads = MIN (kernel_threads, device_param->kernel_preferred_wgs_multiple_loop2);
+    }
   }
 
-  // or if it requires for example a lot of local memory
-
-  const u32 limited_kernel_threads = hashconfig_limited_kernel_threads (hashcat_ctx, device_param);
-
-  if (limited_kernel_threads)
-  {
-    kernel_threads = MIN (kernel_threads, limited_kernel_threads);
-  }
 
   // we'll return a number power of two, makes future processing much more easy
 
   // kernel_threads = power_of_two_floor_32 (kernel_threads);
 
   return kernel_threads;
 }
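The replacement path clamps the device maximum against each kernel's preferred work-group size multiple, skipping multiples that were never queried. The same pattern in isolation (a sketch; the zero-means-unset convention is taken from the diff above):

#include <stdint.h>

typedef uint32_t u32;

#define MIN(a,b) (((a) < (b)) ? (a) : (b))

// Start from the device-wide maximum and shrink toward the smallest
// preferred work-group size multiple among the kernels in use; a value
// of 0 means the multiple was not queried and is skipped.
static u32 clamp_kernel_threads (u32 device_max, const u32 *multiples, int count)
{
  u32 kernel_threads = device_max;

  for (int i = 0; i < count; i++)
  {
    if (multiples[i]) kernel_threads = MIN (kernel_threads, multiples[i]);
  }

  return kernel_threads;
}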

File diff suppressed because it is too large

View File

@@ -35,35 +35,6 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   device_param->kernel_params_buf32[31] = 1;
   device_param->kernel_params_buf32[32] = 0;
 
-  const u32 kernel_threads_by_wgs_kernel1_sav      = device_param->kernel_threads_by_wgs_kernel1;
-  const u32 kernel_threads_by_wgs_kernel12_sav     = device_param->kernel_threads_by_wgs_kernel12;
-  const u32 kernel_threads_by_wgs_kernel2_sav      = device_param->kernel_threads_by_wgs_kernel2;
-  const u32 kernel_threads_by_wgs_kernel23_sav     = device_param->kernel_threads_by_wgs_kernel23;
-  const u32 kernel_threads_by_wgs_kernel3_sav      = device_param->kernel_threads_by_wgs_kernel3;
-  const u32 kernel_threads_by_wgs_kernel4_sav      = device_param->kernel_threads_by_wgs_kernel4;
-  const u32 kernel_threads_by_wgs_kernel_init2_sav = device_param->kernel_threads_by_wgs_kernel_init2;
-  const u32 kernel_threads_by_wgs_kernel_loop2_sav = device_param->kernel_threads_by_wgs_kernel_loop2;
-
-  if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options_extra->attack_kern == ATTACK_KERN_BF))
-  {
-    // do nothing
-  }
-  else
-  {
-    // there's a few algorithm that force a fixed thread size but are not listed in hashconfig_forced_kernel_threads()
-    // because it's not a global fixed thread, just a single one on a single kernel
-    // if it wants to run at 8 and we set it to 1 it will return CL_INVALID_WORK_GROUP_SIZE
-
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel1)      device_param->kernel_threads_by_wgs_kernel1      = 1;
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel12)     device_param->kernel_threads_by_wgs_kernel12     = 1;
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel2)      device_param->kernel_threads_by_wgs_kernel2      = 1;
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel23)     device_param->kernel_threads_by_wgs_kernel23     = 1;
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel3)      device_param->kernel_threads_by_wgs_kernel3      = 1;
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel4)      device_param->kernel_threads_by_wgs_kernel4      = 1;
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel_init2) device_param->kernel_threads_by_wgs_kernel_init2 = 1;
-    if (device_param->kernel_threads_by_user == device_param->kernel_threads_by_wgs_kernel_loop2) device_param->kernel_threads_by_wgs_kernel_loop2 = 1;
-  }
 
   // password : move the known password into a fake buffer
 
   u32 highest_pw_len = 0;
@@ -458,15 +429,6 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   // finish : cleanup and restore
 
-  device_param->kernel_threads_by_wgs_kernel1      = kernel_threads_by_wgs_kernel1_sav;
-  device_param->kernel_threads_by_wgs_kernel12     = kernel_threads_by_wgs_kernel12_sav;
-  device_param->kernel_threads_by_wgs_kernel2      = kernel_threads_by_wgs_kernel2_sav;
-  device_param->kernel_threads_by_wgs_kernel23     = kernel_threads_by_wgs_kernel23_sav;
-  device_param->kernel_threads_by_wgs_kernel3      = kernel_threads_by_wgs_kernel3_sav;
-  device_param->kernel_threads_by_wgs_kernel4      = kernel_threads_by_wgs_kernel4_sav;
-  device_param->kernel_threads_by_wgs_kernel_init2 = kernel_threads_by_wgs_kernel_init2_sav;
-  device_param->kernel_threads_by_wgs_kernel_loop2 = kernel_threads_by_wgs_kernel_loop2_sav;
-
   device_param->kernel_params_buf32[27] = 0;
   device_param->kernel_params_buf32[28] = 0;
   device_param->kernel_params_buf32[29] = 0;
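The removed workaround guarded against the CL_INVALID_WORK_GROUP_SIZE failure described in its comment: a kernel compiled with a required work-group size rejects an enqueue with any other local size. A standalone sketch of that failure mode (illustrative; the queue and kernel handles are assumed valid):

#include <CL/cl.h>

// Enqueue with an explicit local work size. If the kernel was built with
// a reqd_work_group_size attribute (say, 8) and `local` differs (say, 1),
// the runtime refuses the launch with CL_INVALID_WORK_GROUP_SIZE.
static cl_int enqueue_with_local_size (cl_command_queue queue, cl_kernel kernel,
                                       size_t global, size_t local)
{
  const cl_int CL_err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL,
                                                &global, &local, 0, NULL, NULL);

  if (CL_err == CL_INVALID_WORK_GROUP_SIZE)
  {
    // the requested local size conflicts with the kernel's required size
  }

  return CL_err;
}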

View File

@@ -1389,7 +1389,7 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d
   if (device_param->skipped == false)
   {
-    const u32 speed_pos = device_param->speed_pos;
+    const u32 speed_pos = MAX (device_param->speed_pos, 1);
 
     for (u32 i = 0; i < speed_pos; i++)
     {
@@ -1817,7 +1817,7 @@ int status_get_kernel_threads_dev (const hashcat_ctx_t *hashcat_ctx, const int d
   if (device_param->skipped == true) return 0;
 
-  return device_param->kernel_threads_by_user;
+  return device_param->kernel_threads;
 }
 
 int status_get_vector_width_dev (const hashcat_ctx_t *hashcat_ctx, const int device_id)