From fd2cb59d26082e1641c3f21118db7ab14a5e4930 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Tue, 27 Jul 2021 09:37:31 +0200
Subject: [PATCH] AMD GPUs: On Apple OpenCL platform, we ask for the preferred
 kernel thread size rather than hard-coding 32 ECC secp256k1: Removed the
 inline assembly code for AMD GPUs because the latest JIT compilers optimize
 it with the same efficiency

---
 OpenCL/inc_ecc_secp256k1.cl |  8 ++++++--
 src/backend.c               | 17 +++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/OpenCL/inc_ecc_secp256k1.cl b/OpenCL/inc_ecc_secp256k1.cl
index b3a70df78..a487152ec 100644
--- a/OpenCL/inc_ecc_secp256k1.cl
+++ b/OpenCL/inc_ecc_secp256k1.cl
@@ -124,7 +124,9 @@ DECLSPEC u32 sub (u32 *r, const u32 *a, const u32 *b)
     :  "r"(a[0]),  "r"(a[1]),  "r"(a[2]),  "r"(a[3]),  "r"(a[4]),  "r"(a[5]),  "r"(a[6]),  "r"(a[7]),
        "r"(b[0]),  "r"(b[1]),  "r"(b[2]),  "r"(b[3]),  "r"(b[4]),  "r"(b[5]),  "r"(b[6]),  "r"(b[7])
   );
-  #elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
+  // HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm
+  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
+  #elif 0
   __asm__ __volatile__
   (
     "V_SUB_U32   %0,  %9, %17;"
@@ -176,7 +178,9 @@ DECLSPEC u32 add (u32 *r, const u32 *a, const u32 *b)
     :  "r"(a[0]),  "r"(a[1]),  "r"(a[2]),  "r"(a[3]),  "r"(a[4]),  "r"(a[5]),  "r"(a[6]),  "r"(a[7]),
        "r"(b[0]),  "r"(b[1]),  "r"(b[2]),  "r"(b[3]),  "r"(b[4]),  "r"(b[5]),  "r"(b[6]),  "r"(b[7])
   );
-  #elif (defined IS_AMD || defined IS_HIP) && HAS_VADD == 1 && HAS_VADDC == 1
+  // HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm
+  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
+  #elif 0
   __asm__ __volatile__
   (
     "V_ADD_U32   %0,  %9, %17;"
diff --git a/src/backend.c b/src/backend.c
index d8d213bc8..aa28e02d5 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -9366,6 +9366,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
         if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
         {
+          if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
+          {
+            // from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_device_attribute_query.txt
+            #define CL_DEVICE_WAVEFRONT_WIDTH_AMD                   0x4043
+
+            // crazy, but apple does not support this query!
+            // the best alternative is "Preferred work group size multiple (kernel)", but requires to specify a kernel.
+            // so we will set kernel_preferred_wgs_multiple intentionally to 0 because otherwise it it set to 8 by default.
+            // we then assign the value kernel_preferred_wgs_multiple a small kernel like bzero after test if this was set to 0.
+
+            device_param->kernel_preferred_wgs_multiple = 0;
+          }
+
           if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
           {
             cl_uint device_wavefront_width_amd;
@@ -12023,6 +12036,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
         if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_bzero, &device_param->kernel_preferred_wgs_multiple_bzero) == -1) return -1;
 
+        // apple hack, but perhaps also an alternative for other vendors
+
+        if (device_param->kernel_preferred_wgs_multiple == 0) device_param->kernel_preferred_wgs_multiple = device_param->kernel_preferred_wgs_multiple_bzero;
+
         // GPU autotune init
 
         if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1;