1
0
mirror of https://github.com/hashcat/hashcat.git synced 2024-11-29 11:28:15 +00:00

AMD GPUs: On Apple OpenCL platform, we ask for the preferred kernel thread size rather than hard-coding 32

ECC secp256k1: Removed the inline assembly code for AMD GPUs because the latest JIT compilers optimize it with the same efficiency
This commit is contained in:
Jens Steube 2021-07-27 09:37:31 +02:00
parent 7f419c68af
commit fd2cb59d26
2 changed files with 23 additions and 2 deletions

View File

@ -124,7 +124,9 @@ DECLSPEC u32 sub (u32 *r, const u32 *a, const u32 *b)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
// HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm
//#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
#elif 0
__asm__ __volatile__
(
"V_SUB_U32 %0, %9, %17;"
@ -176,7 +178,9 @@ DECLSPEC u32 add (u32 *r, const u32 *a, const u32 *b)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VADD == 1 && HAS_VADDC == 1
// HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm
//#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
#elif 0
__asm__ __volatile__
(
"V_ADD_U32 %0, %9, %17;"

View File

@ -9366,6 +9366,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
{
// from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_device_attribute_query.txt
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
// crazy, but apple does not support this query!
// the best alternative is "Preferred work group size multiple (kernel)", but requires to specify a kernel.
// so we will set kernel_preferred_wgs_multiple intentionally to 0 because otherwise it it set to 8 by default.
// we then assign the value kernel_preferred_wgs_multiple a small kernel like bzero after test if this was set to 0.
device_param->kernel_preferred_wgs_multiple = 0;
}
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
{
cl_uint device_wavefront_width_amd;
@ -12023,6 +12036,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_bzero, &device_param->kernel_preferred_wgs_multiple_bzero) == -1) return -1;
// apple hack, but perhaps also an alternative for other vendors
if (device_param->kernel_preferred_wgs_multiple == 0) device_param->kernel_preferred_wgs_multiple = device_param->kernel_preferred_wgs_multiple_bzero;
// GPU autotune init
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1;