Fix OpenCL spawning unnecessary work-item due to redundant multiplication in new OPTS_TYPE_THREAD_MULTI_DISABLE mode.

Prepare Metal section in run_kernel() for 2D kernel invocation related to new salt->salt_dimy variable. Move reusable Argon2 module code into separate file argon2_commit.c, similar to scrypt_commit.c, and update headers. Update existing hash mode 34000 to use argon2_commit.c.
2025-07-29 18:08:46 +00:00 · 2025-07-03 08:10:30 +02:00 · 2025-07-03 08:10:30 +02:00 · e8cf8bd146
commit e8cf8bd146
parent 2e185b5450
3 changed files with 221 additions and 171 deletions
--- a/src/backend.c
+++ b/src/backend.c
@ -2860,7 +2860,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con

    if (kernel_threads == 0) kernel_threads = 1;

-    num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    }

    if (kern_run == KERN_RUN_1)
    {
@ -2898,10 +2901,37 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
      }
    }

-    num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    }
+    else
+    {
+      num_elements = num_elements * kernel_threads;
+    }

-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+    size_t global_work_size[3] = { num_elements,   1, 1 };
+    size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+    cl_uint work_dim = 1;
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }

    double ms = 0;

@ -2997,7 +3027,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
    }
    */

-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+    }

    cl_event opencl_event;

--- a/src/modules/argon2_common.c
+++ b/src/modules/argon2_common.c
@ -0,0 +1,174 @@
+/**
+ * Author......: Netherlands Forensic Institute
+ * License.....: MIT
+ */
+
+#include <inttypes.h>
+#include "common.h"
+#include "types.h"
+#include "modules.h"
+#include "bitops.h"
+#include "convert.h"
+#include "shared.h"
+#include "memory.h"
+
+#define ARGON2_SYNC_POINTS  4
+#define ARGON2_BLOCK_SIZE   1024
+
+typedef struct argon2_tmp
+{
+  u32 state[4]; // just something for now
+
+} argon2_tmp_t;
+
+typedef struct argon2_options
+{
+  u32 type;
+  u32 version;
+
+  u32 iterations;
+  u32 parallelism;
+  u32 memory_usage_in_kib;
+
+  u32 segment_length;
+  u32 lane_length;
+  u32 memory_block_count;
+
+  u32 digest_len;
+
+} argon2_options_t;
+
+u32 argon2_module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_min = 32; // hard-coded in kernel
+
+  return kernel_threads_min;
+}
+
+u32 argon2_module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_max = 32; // hard-coded in kernel
+
+  return kernel_threads_max;
+}
+
+u64 argon2_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u64 tmp_size = 0; // we'll add some later
+
+  return tmp_size;
+}
+
+const char *argon2_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
+{
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+
+  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
+  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
+
+  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
+
+  const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count;
+
+  int   lines_sz  = 4096;
+  char *lines_buf = hcmalloc (lines_sz);
+  int   lines_pos = 0;
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u32 device_maxworkgroup_size = device_param->device_maxworkgroup_size;
+
+  const u64 fixed_mem = (256 * 1024 * 1024); // some storage we need for pws[], tmps[], and others. Is around 72MiB in reality.
+
+  const u64 spill_mem = 2048 * device_processors * device_maxworkgroup_size; // 1600 according to ptxas
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (fixed_mem + spill_mem);
+
+  u32 kernel_accel_new = device_processors;
+
+  if (kernel_accel_user)
+  {
+    kernel_accel_new = kernel_accel_user;
+  }
+  else
+  {
+    if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false))
+    {
+      kernel_accel_new = available_mem / size_per_accel;
+
+      kernel_accel_new = MIN (kernel_accel_new, 1024); // 1024 = max supported
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (size_t i = 0; i < strlen (new_device_name); i++)
+  {
+    if (new_device_name[i] == ' ') new_device_name[i] = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
+  return lines_buf;
+}
+
+u64 argon2_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
+{
+  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
+  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
+
+  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
+
+  const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count;
+
+  const u64 size_argon2 = device_param->kernel_accel_max * size_per_accel;
+
+  return size_argon2;
+}
+
+u64 argon2_module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes)
+{
+  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
+  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
+
+  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
+  const u32 parallelism        = (options->parallelism)        ? options->parallelism        : options_st->parallelism;
+
+  for (u32 i = 1; i < hashes->salts_cnt; i++)
+  {
+    if ((memory_block_count != options[i].memory_block_count)
+     || (parallelism        != options[i].parallelism))
+    {
+      return (1ULL << 63) + i;
+    }
+  }
+
+  // now that we know they all have the same settings, we also need to check the self-test hash is different to what the user hash is using
+
+  if ((hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) == 0)
+  {
+    if ((memory_block_count != options_st->memory_block_count)
+     || (parallelism        != options_st->parallelism))
+    {
+      return (1ULL << 62);
+    }
+  }
+
+  u64 tmp_size = sizeof (argon2_tmp_t);
+
+  return tmp_size;
+}
+
+char *argon2_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
+{
+  argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf;
+
+  char *jit_build_options = NULL;
+
+  hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%u -D ARGON2_TMP_ELEM=%u", options[0].parallelism, options[0].memory_block_count);
+
+  return jit_build_options;
+}
+
--- a/src/modules/module_34000.c
+++ b/src/modules/module_34000.c
@ -11,9 +11,6 @@
 #include "shared.h"
 #include "memory.h"

-#define ARGON2_SYNC_POINTS  4
-#define ARGON2_BLOCK_SIZE   1024
-
 static const u32   ATTACK_EXEC    = ATTACK_EXEC_OUTSIDE_KERNEL;
 static const u32   DGST_POS0      = 0;
 static const u32   DGST_POS1      = 1;
@ -48,28 +45,7 @@ u32         module_salt_type      (MAYBE_UNUSED const hashconfig_t *hashconfig,
 const char *module_st_hash        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH;         }
 const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS;         }

-typedef struct argon2_tmp
-{
-  u32 state[4]; // just something for now
-
-} argon2_tmp_t;
-
-typedef struct argon2_options
-{
-  u32 type;
-  u32 version;
-
-  u32 iterations;
-  u32 parallelism;
-  u32 memory_usage_in_kib;
-
-  u32 segment_length;
-  u32 lane_length;
-  u32 memory_block_count;
-
-  u32 digest_len;
-
-} argon2_options_t;
+#include "argon2_common.c"

 static const char *SIGNATURE_ARGON2D  = "$argon2d$";
 static const char *SIGNATURE_ARGON2I  = "$argon2i$";
@ -82,140 +58,6 @@ u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED
  return esalt_size;
 }

-u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_threads_min = 32; // hard-coded in kernel
-
-  return kernel_threads_min;
-}
-
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_threads_max = 32; // hard-coded in kernel
-
-  return kernel_threads_max;
-}
-
-u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u64 tmp_size = 0; // we'll add some later
-
-  return tmp_size;
-}
-
-const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
-{
-  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
-
-  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
-  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
-
-  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
-
-  const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count;
-
-  int   lines_sz  = 4096;
-  char *lines_buf = hcmalloc (lines_sz);
-  int   lines_pos = 0;
-
-  const u32 device_processors = device_param->device_processors;
-
-  const u32 device_maxworkgroup_size = device_param->device_maxworkgroup_size;
-
-  const u64 fixed_mem = (256 * 1024 * 1024); // some storage we need for pws[], tmps[], and others. Is around 72MiB in reality.
-
-  const u64 spill_mem = 2048 * device_processors * device_maxworkgroup_size; // 1600 according to ptxas
-
-  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (fixed_mem + spill_mem);
-
-  u32 kernel_accel_new = device_processors;
-
-  if (kernel_accel_user)
-  {
-    kernel_accel_new = kernel_accel_user;
-  }
-  else
-  {
-    if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false))
-    {
-      kernel_accel_new = available_mem / size_per_accel;
-
-      kernel_accel_new = MIN (kernel_accel_new, 1024); // 1024 = max supported
-    }
-  }
-
-  char *new_device_name = hcstrdup (device_param->device_name);
-
-  for (size_t i = 0; i < strlen (new_device_name); i++)
-  {
-    if (new_device_name[i] == ' ') new_device_name[i] = '_';
-  }
-
-  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
-
-  hcfree (new_device_name);
-
-  return lines_buf;
-}
-
-u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
-{
-  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
-  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
-
-  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
-
-  const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count;
-
-  const u64 size_argon2 = device_param->kernel_accel_max * size_per_accel;
-
-  return size_argon2;
-}
-
-u64 module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes)
-{
-  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
-  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
-
-  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
-  const u32 parallelism        = (options->parallelism)        ? options->parallelism        : options_st->parallelism;
-
-  for (u32 i = 1; i < hashes->salts_cnt; i++)
-  {
-    if ((memory_block_count != options[i].memory_block_count)
-     || (parallelism        != options[i].parallelism))
-    {
-      return (1ULL << 63) + i;
-    }
-  }
-
-  // now that we know they all have the same settings, we also need to check the self-test hash is different to what the user hash is using
-
-  if ((hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) == 0)
-  {
-    if ((memory_block_count != options_st->memory_block_count)
-     || (parallelism        != options_st->parallelism))
-    {
-      return (1ULL << 62);
-    }
-  }
-
-  u64 tmp_size = sizeof (argon2_tmp_t);
-
-  return tmp_size;
-}
-
-char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
-{
-  argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf;
-
-  char *jit_build_options = NULL;
-
-  hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%u -D ARGON2_TMP_ELEM=%u", options[0].parallelism, options[0].memory_block_count);
-
-  return jit_build_options;
-}
-
 int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
 {
  u32 *digest = (u32 *) digest_buf;
@ -393,9 +235,9 @@ void module_init (module_ctx_t *module_ctx)
  module_ctx->module_dgst_size                = module_dgst_size;
  module_ctx->module_dictstat_disable         = MODULE_DEFAULT;
  module_ctx->module_esalt_size               = module_esalt_size;
-  module_ctx->module_extra_buffer_size        = module_extra_buffer_size;
-  module_ctx->module_extra_tmp_size           = module_extra_tmp_size;
-  module_ctx->module_extra_tuningdb_block     = module_extra_tuningdb_block;
+  module_ctx->module_extra_buffer_size        = argon2_module_extra_buffer_size;
+  module_ctx->module_extra_tmp_size           = argon2_module_extra_tmp_size;
+  module_ctx->module_extra_tuningdb_block     = argon2_module_extra_tuningdb_block;
  module_ctx->module_forced_outfile_format    = MODULE_DEFAULT;
  module_ctx->module_hash_binary_count        = MODULE_DEFAULT;
  module_ctx->module_hash_binary_parse        = MODULE_DEFAULT;
@ -421,14 +263,14 @@ void module_init (module_ctx_t *module_ctx)
  module_ctx->module_hook23                   = MODULE_DEFAULT;
  module_ctx->module_hook_salt_size           = MODULE_DEFAULT;
  module_ctx->module_hook_size                = MODULE_DEFAULT;
-  module_ctx->module_jit_build_options        = module_jit_build_options;
+  module_ctx->module_jit_build_options        = argon2_module_jit_build_options;
  module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
  module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
  module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
  module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
+  module_ctx->module_kernel_threads_max       = argon2_module_kernel_threads_max;
+  module_ctx->module_kernel_threads_min       = argon2_module_kernel_threads_min;
  module_ctx->module_kern_type                = module_kern_type;
  module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
  module_ctx->module_opti_type                = module_opti_type;
@ -447,7 +289,8 @@ void module_init (module_ctx_t *module_ctx)
  module_ctx->module_separator                = MODULE_DEFAULT;
  module_ctx->module_st_hash                  = module_st_hash;
  module_ctx->module_st_pass                  = module_st_pass;
-  module_ctx->module_tmp_size                 = module_tmp_size;
+  module_ctx->module_tmp_size                 = argon2_module_tmp_size;
  module_ctx->module_unstable_warning         = MODULE_DEFAULT;
  module_ctx->module_warmup_disable           = MODULE_DEFAULT;
 }
+