From fd2cb59d26082e1641c3f21118db7ab14a5e4930 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 27 Jul 2021 09:37:31 +0200 Subject: [PATCH] AMD GPUs: On Apple OpenCL platform, we ask for the preferred kernel thread size rather than hard-coding 32 ECC secp256k1: Removed the inline assembly code for AMD GPUs because the latest JIT compilers optimize it with the same efficiency --- OpenCL/inc_ecc_secp256k1.cl | 8 ++++++-- src/backend.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/OpenCL/inc_ecc_secp256k1.cl b/OpenCL/inc_ecc_secp256k1.cl index b3a70df78..a487152ec 100644 --- a/OpenCL/inc_ecc_secp256k1.cl +++ b/OpenCL/inc_ecc_secp256k1.cl @@ -124,7 +124,9 @@ DECLSPEC u32 sub (u32 *r, const u32 *a, const u32 *b) : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]) ); - #elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1 + // HIP doesn't support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm + //#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1 + #elif 0 __asm__ __volatile__ ( "V_SUB_U32 %0, %9, %17;" @@ -176,7 +178,9 @@ DECLSPEC u32 add (u32 *r, const u32 *a, const u32 *b) : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]) ); - #elif (defined IS_AMD || defined IS_HIP) && HAS_VADD == 1 && HAS_VADDC == 1 + // HIP doesn't support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm + //#elif (defined IS_AMD || defined IS_HIP) && HAS_VADD == 1 && HAS_VADDC == 1 + #elif 0 __asm__ __volatile__ ( "V_ADD_U32 %0, %9, %17;" diff --git a/src/backend.c b/src/backend.c index d8d213bc8..aa28e02d5 100644 --- a/src/backend.c +++ b/src/backend.c @@ -9366,6 +9366,19 @@ int backend_ctx_devices_init (hashcat_ctx_t 
*hashcat_ctx, const int comptime) if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) + { + // from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_device_attribute_query.txt + #define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 + + // crazy, but apple does not support this query! + // the best alternative is "Preferred work group size multiple (kernel)", but it requires specifying a kernel. + // so we will set kernel_preferred_wgs_multiple intentionally to 0 because otherwise it is set to 8 by default. + // we then assign kernel_preferred_wgs_multiple the value queried from a small kernel like bzero, after testing whether it was set to 0. + + device_param->kernel_preferred_wgs_multiple = 0; + } + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) { cl_uint device_wavefront_width_amd; @@ -12023,6 +12036,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_bzero, &device_param->kernel_preferred_wgs_multiple_bzero) == -1) return -1; + // apple hack, but perhaps also an alternative for other vendors + + if (device_param->kernel_preferred_wgs_multiple == 0) device_param->kernel_preferred_wgs_multiple = device_param->kernel_preferred_wgs_multiple_bzero; + // GPU autotune init if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1;