OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count

pull/1518/head
Jens Steube 6 years ago
parent a7a15df911
commit d656e9c3a4

@ -61724,7 +61724,8 @@ __kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max)
pw.i[62] = 0;
pw.i[63] = 0; // yep that's faster
pw.pw_len = 1 + (l32 & 15);
//pw.pw_len = 1 + (l32 & 15);
pw.pw_len = 7; // some algorithms are very sensible on this (example: 12500)
buf[gid] = pw;
}

@ -63,6 +63,7 @@
- OpenCL Kernels: Add general function declaration keyword (inline) and some OpenCL runtime specific exceptions for NV and CPU devices
- OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space
- OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements, to reduce startup time
- OpenCL Kernels: Use the kernel local buffer size as additional reference in order to limit the thread-count
- OpenCL Kernels: Thread-count is switched from native to maximum - as a consequence we assume host memory pool of 2GB per GPU
- OpenCL Runtime: Add current timestamp to OpenCL kernel source in order to force OpenCL JiT compiler to recompile and not use the cache
- OpenCL Runtime: Enforce to use OpenCL version 1.2 to restrain OpenCL runtimes to make use of the __generic address space qualifier

@ -61,4 +61,7 @@ bool hc_same_files (char *file1, char *file2);
u32 hc_strtoul (const char *nptr, char **endptr, int base);
u64 hc_strtoull (const char *nptr, char **endptr, int base);
u32 power_of_two_ceil_32 (const u32 v);
u32 power_of_two_floor_32 (const u32 v);
#endif // _SHARED_H

@ -138,14 +138,19 @@ typedef enum amplifier_count
KERNEL_BFS = 1024,
KERNEL_COMBS = 1024,
KERNEL_RULES = 256,
KERNEL_THREADS_MAX_CPU = 1,
KERNEL_THREADS_MAX_GPU = 8, // ex: intel integrated
KERNEL_THREADS_MAX_GPU_NV = 32, // optimized NV size: warps
KERNEL_THREADS_MAX_GPU_AMD = 64, // optimized AMD size: wavefronts
KERNEL_THREADS_MAX_OTHER = 8, // ex: intel MIC
} amplifier_count_t;
typedef enum native_threads
{
KERNEL_THREADS_NATIVE_CPU = 1,
KERNEL_THREADS_NATIVE_GPU = 8, // ex: intel integrated
KERNEL_THREADS_NATIVE_GPU_NV = 32, // optimized NV size: warps
KERNEL_THREADS_NATIVE_GPU_AMD = 64, // optimized AMD size: wavefronts
KERNEL_THREADS_NATIVE_OTHER = 8, // ex: intel MIC
} native_threads_t;
typedef enum vendor_id
{
VENDOR_ID_AMD = (1 << 0),
@ -923,6 +928,7 @@ typedef struct hc_device_param
u64 device_global_mem;
u32 device_maxclock_frequency;
size_t device_maxworkgroup_size;
u64 device_local_mem_size;
u32 vector_width;

@ -351,11 +351,11 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
return -1;
}
// we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance
// but sometimes, when a benchmark kernel run cracks a kernel, we don't want to see that!
if (user_options->speed_only == true)
{
// we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance
// however if the benchmark cracks the artificial hash used for benchmarks we don't want to see that!
return 0;
}

@ -25970,6 +25970,66 @@ u32 hashconfig_forced_kernel_threads (hashcat_ctx_t *hashcat_ctx)
return kernel_threads;
}
u32 hashconfig_limited_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
{
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
u32 kernel_threads = 0;
// sometimes there's a high kernel requirement for local memory (which is multiplied with threads)
u32 local_mem_per_thread = 0;
// basically the sum of all .local space of the _loop kernel
// see .ptx
if (hashconfig->hash_mode == 1800) local_mem_per_thread = 1024;
if (hashconfig->hash_mode == 12500) local_mem_per_thread = 3296;
if (hashconfig->hash_mode == 13400) local_mem_per_thread = 5360;
if (local_mem_per_thread)
{
const u32 device_local_mem_size = (const u32) device_param->device_local_mem_size;
kernel_threads = device_local_mem_size / local_mem_per_thread;
// there can be some very unaligned results from this, therefore round it down to next power of two
kernel_threads = power_of_two_floor_32 (kernel_threads);
}
// make sure to not underpower
if (kernel_threads)
{
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
{
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
}
else if (device_param->device_type & CL_DEVICE_TYPE_GPU)
{
if (device_param->device_vendor_id == VENDOR_ID_NV)
{
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_NV);
}
else if (device_param->device_vendor_id == VENDOR_ID_AMD)
{
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU_AMD);
}
else
{
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_GPU);
}
}
else
{
kernel_threads = MAX (kernel_threads, KERNEL_THREADS_NATIVE_OTHER);
}
}
return kernel_threads;
}
u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_param_t *device_param)
{
const user_options_t *user_options = hashcat_ctx->user_options;
@ -25980,13 +26040,22 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
if (forced_kernel_threads) return forced_kernel_threads;
// otherwise it depends on the opencl device type
// it can also depends on the opencl device type
u32 kernel_threads = (const u32) device_param->device_maxworkgroup_size;
u32 kernel_threads = (u32) device_param->device_maxworkgroup_size;
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
{
kernel_threads = MIN (kernel_threads, KERNEL_THREADS_MAX_CPU);
kernel_threads = MIN (kernel_threads, KERNEL_THREADS_NATIVE_CPU);
}
// or if it requires for example a lot of local memory
const u32 limited_kernel_threads = hashconfig_limited_kernel_threads (hashcat_ctx, device_param);
if (limited_kernel_threads)
{
kernel_threads = MIN (kernel_threads, limited_kernel_threads);
}
return kernel_threads;
@ -25994,7 +26063,7 @@ u32 hashconfig_get_kernel_threads (hashcat_ctx_t *hashcat_ctx, const hc_device_p
u32 hashconfig_get_kernel_loops (hashcat_ctx_t *hashcat_ctx)
{
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
const user_options_t *user_options = hashcat_ctx->user_options;
u32 kernel_loops_fixed = 0;

@ -3306,6 +3306,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
device_param->skipped = true;
}
device_param->device_local_mem_size = device_local_mem_size;
// If there's both an Intel CPU and an AMD OpenCL runtime it's a tricky situation
// Both platforms support CPU device types and therefore both will try to use 100% of the physical resources
// This results in both utilizing it for 50%

@ -506,3 +506,32 @@ u64 hc_strtoull (const char *nptr, char **endptr, int base)
{
return (u64) strtoull (nptr, endptr, base);
}
u32 power_of_two_ceil_32 (const u32 v)
{
u32 r = v;
r--;
r |= r >> 1;
r |= r >> 2;
r |= r >> 4;
r |= r >> 8;
r |= r >> 16;
r++;
return r;
}
u32 power_of_two_floor_32 (const u32 v)
{
u32 r = power_of_two_ceil_32 (v);
if (r > v)
{
r >>= 1;
}
return r;
}

@ -1389,15 +1389,17 @@ double status_get_hashes_msec_dev (const hashcat_ctx_t *hashcat_ctx, const int d
if (device_param->skipped == false)
{
for (int i = 0; i < SPEED_CACHE; i++)
const u32 speed_pos = device_param->speed_pos;
for (int i = 0; i < speed_pos; i++)
{
speed_cnt += device_param->speed_cnt[i];
speed_msec += device_param->speed_msec[i];
}
}
speed_cnt /= SPEED_CACHE;
speed_msec /= SPEED_CACHE;
speed_cnt /= speed_pos;
speed_msec /= speed_pos;
}
double hashes_dev_msec = 0;

Loading…
Cancel
Save