Metal Backend:

- added support to 2D/3D Compute - improved compute workloads calculation Makefile: - updated MACOSX_DEPLOYMENT_TARGET to 15.0 Unit tests: - updated install_modules.sh with Crypt::Argon2 Argon2 start works with Apple Metal
2025-07-07 15:18:15 +00:00 · 2025-07-03 22:06:32 +02:00 · 2025-07-03 22:06:32 +02:00 · bcc351068f
commit bcc351068f
parent b9b20c3340
11 changed files with 93 additions and 41 deletions
--- a/OpenCL/inc_amp.h
+++ b/OpenCL/inc_amp.h
@ -16,7 +16,7 @@
  GLOBAL_AS   const bf_t          *bfs_buf,    \
  CONSTANT_AS const u32           &combs_mode, \
  CONSTANT_AS const u64           &gid_max,    \
-                    uint           hc_gid [[ thread_position_in_grid ]]
+                    uint3          hc_gid [[ thread_position_in_grid ]]

 #else // CUDA, HIP, OpenCL

--- a/OpenCL/inc_common.h
+++ b/OpenCL/inc_common.h
@ -124,10 +124,10 @@

 #if defined IS_METAL
 #define KERN_ATTR_MAIN_PARAMS                       \
-  uint hc_gid [[ thread_position_in_grid ]],        \
-  uint hc_lid [[ thread_position_in_threadgroup ]], \
-  uint hc_lsz [[ threads_per_threadgroup ]],        \
-  uint hc_bid [[ threadgroup_position_in_grid ]]
+  uint3 hc_gid [[ thread_position_in_grid ]],        \
+  uint3 hc_lid [[ thread_position_in_threadgroup ]], \
+  uint3 hc_lsz [[ threads_per_threadgroup ]],        \
+  uint3 hc_bid [[ threadgroup_position_in_grid ]]
 #endif // IS_METAL

 /*
--- a/OpenCL/inc_markov.h
+++ b/OpenCL/inc_markov.h
@ -19,7 +19,7 @@
  CONSTANT_AS const u32  &bits14,         \
  CONSTANT_AS const u32  &bits15,         \
  CONSTANT_AS const u64  &gid_max,        \
-                    uint  hc_gid [[ thread_position_in_grid ]]
+                    uint3 hc_gid [[ thread_position_in_grid ]]

 #define KERN_ATTR_R_MARKOV                \
  GLOBAL_AS         bf_t *pws_buf_r,      \
@ -31,7 +31,7 @@
  CONSTANT_AS const u32  &bits14,         \
  CONSTANT_AS const u32  &bits15,         \
  CONSTANT_AS const u64  &gid_max,        \
-                    uint  hc_gid [[ thread_position_in_grid ]]
+                    uint3 hc_gid [[ thread_position_in_grid ]]

 #define KERN_ATTR_C_MARKOV                \
  GLOBAL_AS         pw_t *pws_buf,        \
@ -43,7 +43,7 @@
  CONSTANT_AS const u32  &bits14,         \
  CONSTANT_AS const u32  &bits15,         \
  CONSTANT_AS const u64  &gid_max,        \
-                    uint  hc_gid [[ thread_position_in_grid ]]
+                    uint3 hc_gid [[ thread_position_in_grid ]]

 #else // CUDA, HIP, OpenCL

--- a/OpenCL/inc_platform.h
+++ b/OpenCL/inc_platform.h
@ -73,10 +73,25 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_or  (volatile GLOBAL_AS u32 *p, volatile const u32 val);

-#define get_global_id(param) hc_gid
-#define get_local_id(param) hc_lid
-#define get_local_size(param) hc_lsz
-#define get_group_id(param) hc_bid
+#define get_global_id(dimindx)   \
+  ((dimindx) == 0 ? hc_gid.x :   \
+   (dimindx) == 1 ? hc_gid.y :   \
+   (dimindx) == 2 ? hc_gid.z : -1)
+
+#define get_group_id(dimindx)    \
+  ((dimindx) == 0 ? hc_bid.x :   \
+   (dimindx) == 1 ? hc_bid.y :   \
+   (dimindx) == 2 ? hc_bid.z : -1)
+
+#define get_local_id(dimindx)    \
+  ((dimindx) == 0 ? hc_lid.x :   \
+   (dimindx) == 1 ? hc_lid.y :   \
+   (dimindx) == 2 ? hc_lid.z : -1)
+
+#define get_local_size(dimindx)  \
+  ((dimindx) == 0 ? hc_lsz.x :   \
+   (dimindx) == 1 ? hc_lsz.y :   \
+   (dimindx) == 2 ? hc_lsz.z : -1)

 DECLSPEC u32x rotl32   (const u32x a, const int n);
 DECLSPEC u32x rotr32   (const u32x a, const int n);
--- a/OpenCL/inc_shared.h
+++ b/OpenCL/inc_shared.h
@ -13,28 +13,28 @@
  GLOBAL_AS         u32      *pws_comp, \
  GLOBAL_AS         pw_t     *pws_buf,  \
  CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]

 #define KERN_ATTR_GPU_MEMSET            \
  GLOBAL_AS         uint4    *buf,      \
  CONSTANT_AS const u32      &value,    \
  CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]

 #define KERN_ATTR_GPU_BZERO             \
  GLOBAL_AS         uint4    *buf,      \
  CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]

 #define KERN_ATTR_GPU_ATINIT            \
  GLOBAL_AS         pw_t     *buf,      \
  CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]

 #define KERN_ATTR_GPU_UTF8_TO_UTF16     \
  GLOBAL_AS         pw_t     *pws_buf,  \
  CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]

 #else // CUDA, HIP, OpenCL

--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -148,19 +148,23 @@
 - Status Code: Add specific return code for self-test fail (-11)
 - Scrypt: Increase buffer sizes in module for hash mode 8900 to allow longer scrypt digests
 - Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629
+- Unit tests: Updated install_modules.sh with Crypt::Argon2
 - User Options: Added error message when mixing --username and --show to warn users of exponential delay
 - MetaMask: update extraction tool to support MetaMask Mobile wallets
 - SecureCRT MasterPassphrase v2: update module, pure kernels and test unit. Add optimized kernels.
 - Metal Backend: added workaround to prevent 'Infinite Loop' bug when build kernels
 - Metal Backend: added workaround to set the true Processor value in Metal devices on Apple Intel
+- Metal Backend: added support to 2D/3D Compute
 - Metal Backend: allow use of devices with Metal if runtime version is >= 200
 - Metal Backend: disable Metal devices only if at least one OpenCL device is active
+- Metal Backend: improved compute workloads calculation
 - Modules: Check UnpackSize to raise false positive with hc_decompress_rar
 - User Options: added --metal-compiler-runtime option
 - Hardware Monitor: avoid sprintf in src/ext_iokit.c
 - Hardware Monitor: Splitting hwmon_ctx_init function into smaller library-specific functions
 - Help: show supported hash-modes only with -hh
 - Makefile: prevent make failure with Apple Silicon in case of partial rebuild
+- Makefile: updated MACOSX_DEPLOYMENT_TARGET to 15.0
 - Rules: Rename best64.rule to best66.rule and remove the unknown section from it

 * changes v6.2.5 -> v6.2.6
--- a/include/ext_metal.h
+++ b/include/ext_metal.h
@ -111,7 +111,7 @@ int  hc_mtlCreateLibraryWithFile    (void *hashcat_ctx, mtl_device_id metal_devi
 int  hc_mtlEncodeComputeCommand_pre (void *hashcat_ctx, mtl_pipeline metal_pipeline, mtl_command_queue metal_command_queue, mtl_command_buffer *metal_command_buffer, mtl_command_encoder *metal_command_encoder);
 int  hc_mtlSetCommandEncoderArg     (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, size_t off, size_t idx, mtl_mem buf, void *host_data, size_t host_data_size);

-int  hc_mtlEncodeComputeCommand     (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const size_t global_work_size[3], const size_t local_work_size[3], double *ms);
+int  hc_mtlEncodeComputeCommand     (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms);

 #endif // __APPLE__

--- a/src/Makefile
+++ b/src/Makefile
@ -358,7 +358,7 @@ LFLAGS_NATIVE           += -lpthread
 endif # NetBSD

 ifeq ($(UNAME),Darwin)
-export MACOSX_DEPLOYMENT_TARGET=10.15
+export MACOSX_DEPLOYMENT_TARGET=15.0
 CFLAGS_NATIVE           := $(CFLAGS)
 CFLAGS_NATIVE           += -DWITH_HWMON

--- a/src/backend.c
+++ b/src/backend.c
@ -2206,7 +2206,7 @@ int run_metal_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi

  double ms = 0;

-  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1;
+  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;

  return 0;
 }
@ -2234,7 +2234,7 @@ int run_metal_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_

  double ms = 0;

-  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1;
+  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;

  return 0;
 }
@ -2265,7 +2265,7 @@ int run_metal_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic

    double ms = 0;

-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
  }

  if (num16m)
@ -2910,29 +2910,34 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
      num_elements = num_elements * kernel_threads;
    }

+    unsigned int work_dim = 1;
+
    size_t global_work_size[3] = { num_elements,   1, 1 };
    size_t local_work_size[3]  = { kernel_threads, 1, 1 };

    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1))
    {
      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
    }

    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2))
    {
      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
    }

    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3))
    {
      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
    }

    double ms = 0;

    if (is_autotune == true)
    {
-      hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms);
+      hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, work_dim, global_work_size, local_work_size, &ms);

      // hc_mtlEncodeComputeCommand_pre() must be called before every hc_mtlEncodeComputeCommand()
      if (hc_mtlEncodeComputeCommand_pre (hashcat_ctx, metal_pipeline, device_param->metal_command_queue, &metal_command_buffer, &metal_command_encoder) == -1) return -1;
@ -2951,7 +2956,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
      }
    }

-    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms);
+    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, work_dim, global_work_size, local_work_size, &ms);

    if (rc_cc != -1)
    {
@ -3344,7 +3349,7 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,

    double ms = 0;

-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
  }
  #endif // __APPLE__

@ -3435,7 +3440,7 @@ int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)

    double ms = 0;

-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
  }
  #endif // __APPLE__

@ -3519,7 +3524,7 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,

    double ms = 0;

-    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms);
+    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms);

    // release tmp_buf

@ -3599,7 +3604,7 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device

    double ms = 0;

-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
  }
  #endif // __APPLE__

--- a/src/ext_metal.m
+++ b/src/ext_metal.m
@ -195,11 +195,14 @@ static int hc_mtlBuildOptionsToDict (void *hashcat_ctx, const char *build_option
  }

  // if set, add INCLUDE_PATH to hack Apple kernel build from source limitation on -I usage
+
  if (include_path != nil)
  {
    NSString *path_key = @"INCLUDE_PATH";
    NSString *path_value = [NSString stringWithCString: include_path encoding: NSUTF8StringEncoding];
+
    // Include path may contain spaces, escape them with a backslash
+
    path_value = [path_value stringByReplacingOccurrencesOfString:@" " withString:@"\\ "];

    [build_options_dict setObject:path_value forKey:path_key];
@ -743,6 +746,7 @@ int hc_mtlCreateKernel (void *hashcat_ctx, mtl_device_id metal_device, mtl_libra
  dispatch_queue_t queue = dispatch_get_global_queue (DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);

  // if no user-defined runtime, set to METAL_COMPILER_RUNTIME
+
  long timeout = (user_options->metal_compiler_runtime > 0) ? user_options->metal_compiler_runtime : METAL_COMPILER_RUNTIME;

  dispatch_time_t when = dispatch_time (DISPATCH_TIME_NOW,NSEC_PER_SEC * timeout);
@ -1314,10 +1318,21 @@ int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_com
  return 0;
 }

-int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const size_t global_work_size[3], const size_t local_work_size[3], double *ms)
+int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms)
 {
-  MTLSize numThreadgroups = {local_work_size[0], local_work_size[1], local_work_size[2]};
-  MTLSize threadsGroup = {global_work_size[0], global_work_size[1], global_work_size[2]};
+  MTLSize threadsPerThreadgroup =
+  {
+    local_work_size[0],
+    local_work_size[1],
+    local_work_size[2]
+  };
+
+  MTLSize threadgroupsPerGrid =
+  {
+    (global_work_size[0] + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width,
+    work_dim > 1 ? (global_work_size[1] + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height : 1,
+    work_dim > 2 ? (global_work_size[2] + threadsPerThreadgroup.depth - 1) / threadsPerThreadgroup.depth : 1
+  };

  if (metal_command_encoder == nil)
  {
@ -1333,7 +1348,7 @@ int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_com
    return -1;
  }

-  [metal_command_encoder dispatchThreadgroups: threadsGroup threadsPerThreadgroup: numThreadgroups];
+  [metal_command_encoder dispatchThreadgroups: threadgroupsPerGrid threadsPerThreadgroup: threadsPerThreadgroup];

  [metal_command_encoder endEncoding];
  [metal_command_buffer commit];
@ -1377,7 +1392,11 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_device,

  if (k_string != nil)
  {
-    id <MTLLibrary> r = [metal_device newLibraryWithFile: k_string error: &error];
+    NSURL *libURL = [NSURL fileURLWithPath: k_string];
+
+    if (libURL != nil)
+    {
+      id <MTLLibrary> r = [metal_device newLibraryWithURL: libURL error:&error];

      if (error != nil)
      {
@ -1389,6 +1408,7 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_device,

      return 0;
    }
+  }

  return -1;
 }
@ -1420,10 +1440,17 @@ int hc_mtlCreateLibraryWithSource (void *hashcat_ctx, mtl_device_id metal_device
      }

      compileOptions.preprocessorMacros = build_options_dict;
+      /*
+      compileOptions.optimizationLevel = MTLLibraryOptimizationLevelSize;
+      compileOptions.mathMode = MTLMathModeSafe;
+      // compileOptions.mathMode = MTLMathModeRelaxed;
+      // compileOptions.enableLogging = true;
+      // compileOptions.fastMathEnabled = false;
+      */
    }

    // todo: detect current os version and choose the right
-//    compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3;
+    // compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3;
 /*
    if (@available(macOS 12.0, *))
    {
--- a/tools/install_modules.sh
+++ b/tools/install_modules.sh
@ -18,6 +18,7 @@ cpan install Authen::Passphrase::LANManager \
             Bitcoin::Crypto::Base58        \
             Compress::Zlib                 \
             Convert::EBCDIC                \
+             Crypt::Argon2                  \
             Crypt::AuthEnc::GCM            \
             Crypt::Camellia                \
             Crypt::CBC                     \