Apply previous blowfish optimization for -m 3200 also on -m 9000 and -m 18600

2025-05-21 08:18:48 +00:00 · 2019-03-18 12:42:47 +01:00 · 2019-03-18 12:42:47 +01:00 · 5ef67a8ab7
commit 5ef67a8ab7
parent a172ab7d8a
10 changed files with 370 additions and 125 deletions
--- a/OpenCL/inc_types.cl
+++ b/OpenCL/inc_types.cl
@ -39,6 +39,60 @@ typedef VTYPE(uint,   VECT_SIZE) u32x;
 typedef VTYPE(ulong,  VECT_SIZE) u64x;
 #endif

+// unions
+
+typedef union vconv32
+{
+  u64 v32;
+
+  struct
+  {
+    u16 v16a;
+    u16 v16b;
+  };
+
+  struct
+  {
+    u8 v8a;
+    u8 v8b;
+    u8 v8c;
+    u8 v8d;
+  };
+
+} vconv32_t;
+
+typedef union vconv64
+{
+  u64 v64;
+
+  struct
+  {
+    u32 v32a;
+    u32 v32b;
+  };
+
+  struct
+  {
+    u16 v16a;
+    u16 v16b;
+    u16 v16c;
+    u16 v16d;
+  };
+
+  struct
+  {
+    u8 v8a;
+    u8 v8b;
+    u8 v8c;
+    u8 v8d;
+    u8 v8e;
+    u8 v8f;
+    u8 v8g;
+    u8 v8h;
+  };
+
+} vconv64_t;
+
 DECLSPEC u32 l32_from_64_S (u64 a);
 DECLSPEC u32 l32_from_64_S (u64 a)
 {
@ -1426,3 +1480,189 @@ typedef struct keyboard_layout_mapping
  int dst_len;

 } keyboard_layout_mapping_t;
+
+// functions
+
+DECLSPEC u8 v8a_from_v32_S (const u32 v32);
+DECLSPEC u8 v8a_from_v32_S (const u32 v32)
+{
+  vconv32_t v;
+
+  v.v32 = v32;
+
+  return v.v8a;
+}
+
+DECLSPEC u8 v8b_from_v32_S (const u32 v32);
+DECLSPEC u8 v8b_from_v32_S (const u32 v32)
+{
+  vconv32_t v;
+
+  v.v32 = v32;
+
+  return v.v8b;
+}
+
+DECLSPEC u8 v8c_from_v32_S (const u32 v32);
+DECLSPEC u8 v8c_from_v32_S (const u32 v32)
+{
+  vconv32_t v;
+
+  v.v32 = v32;
+
+  return v.v8c;
+}
+
+DECLSPEC u8 v8d_from_v32_S (const u32 v32);
+DECLSPEC u8 v8d_from_v32_S (const u32 v32)
+{
+  vconv32_t v;
+
+  v.v32 = v32;
+
+  return v.v8d;
+}
+
+DECLSPEC u16 v16a_from_v32_S (const u32 v32);
+DECLSPEC u16 v16a_from_v32_S (const u32 v32)
+{
+  vconv32_t v;
+
+  v.v32 = v32;
+
+  return v.v16a;
+}
+
+DECLSPEC u16 v16b_from_v32_S (const u32 v32);
+DECLSPEC u16 v16b_from_v32_S (const u32 v32)
+{
+  vconv32_t v;
+
+  v.v32 = v32;
+
+  return v.v16b;
+}
+
+DECLSPEC u32 v32_from_v16ab_S (const u16 v16a, const u16 v16b);
+DECLSPEC u32 v32_from_v16ab_S (const u16 v16a, const u16 v16b)
+{
+  vconv32_t v;
+
+  v.v16a = v16a;
+  v.v16b = v16b;
+
+  return v.v32;
+}
+
+DECLSPEC u32 v32a_from_v64_S (const u64 v64);
+DECLSPEC u32 v32a_from_v64_S (const u64 v64)
+{
+  vconv64_t v;
+
+  v.v64 = v64;
+
+  return v.v32a;
+}
+
+DECLSPEC u32 v32b_from_v64_S (const u64 v64);
+DECLSPEC u32 v32b_from_v64_S (const u64 v64)
+{
+  vconv64_t v;
+
+  v.v64 = v64;
+
+  return v.v32b;
+}
+
+DECLSPEC u64 v64_from_v32ab_S (const u32 v32a, const u32 v32b);
+DECLSPEC u64 v64_from_v32ab_S (const u32 v32a, const u32 v32b)
+{
+  vconv64_t v;
+
+  v.v32a = v32a;
+  v.v32b = v32b;
+
+  return v.v64;
+}
+
+// unpack function are similar, but always return u32
+
+DECLSPEC u32 unpack_v8a_from_v32_S (const u32 v32);
+DECLSPEC u32 unpack_v8a_from_v32_S (const u32 v32)
+{
+  u32 r;
+
+  #if defined IS_NV
+  asm ("bfe.u32 %0, %1, 0, 8;" : "=r"(r) : "r"(v32));
+  #elif defined IS_AMD
+    #ifdef HAS_VBFE
+    __asm__ ("V_BFE_U32 %0, %1, 0, 8;" : "=v"(r) : "v"(v32));
+    #else
+    r = (v32 >> 0) & 0xff;
+    #endif
+  #else
+  r = (v32 >> 0) & 0xff;
+  #endif
+
+  return r;
+}
+
+DECLSPEC u32 unpack_v8b_from_v32_S (const u32 v32);
+DECLSPEC u32 unpack_v8b_from_v32_S (const u32 v32)
+{
+  u32 r;
+
+  #if defined IS_NV
+  asm ("bfe.u32 %0, %1, 8, 8;" : "=r"(r) : "r"(v32));
+  #elif defined IS_AMD
+    #ifdef HAS_VBFE
+    __asm__ ("V_BFE_U32 %0, %1, 8, 8;" : "=v"(r) : "v"(v32));
+    #else
+    r = (v32 >> 8) & 0xff;
+    #endif
+  #else
+  r = (v32 >> 8) & 0xff;
+  #endif
+
+  return r;
+}
+
+DECLSPEC u32 unpack_v8c_from_v32_S (const u32 v32);
+DECLSPEC u32 unpack_v8c_from_v32_S (const u32 v32)
+{
+  u32 r;
+
+  #if defined IS_NV
+  asm ("bfe.u32 %0, %1, 16, 8;" : "=r"(r) : "r"(v32));
+  #elif defined IS_AMD
+    #ifdef HAS_VBFE
+    __asm__ ("V_BFE_U32 %0, %1, 16, 8;" : "=v"(r) : "v"(v32));
+    #else
+    r = (v32 >> 16) & 0xff;
+    #endif
+  #else
+  r = (v32 >> 16) & 0xff;
+  #endif
+
+  return r;
+}
+
+DECLSPEC u32 unpack_v8d_from_v32_S (const u32 v32);
+DECLSPEC u32 unpack_v8d_from_v32_S (const u32 v32)
+{
+  u32 r;
+
+  #if defined IS_NV
+  asm ("bfe.u32 %0, %1, 24, 8;" : "=r"(r) : "r"(v32));
+  #elif defined IS_AMD
+    #ifdef HAS_VBFE
+    __asm__ ("V_BFE_U32 %0, %1, 24, 8;" : "=v"(r) : "v"(v32));
+    #else
+    r = (v32 >> 24) & 0xff;
+    #endif
+  #else
+  r = (v32 >> 24) & 0xff;
+  #endif
+
+  return r;
+}
--- a/OpenCL/m03200-pure.cl
+++ b/OpenCL/m03200-pure.cl
@ -299,21 +299,21 @@ __constant u32a c_sbox3[256] =
  0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6
 };

-#define BF_ROUND(L,R,N)                 \
-{                                       \
-  u32 tmp;                              \
-                                        \
-  const u32 r0 = hc_bfe_S ((L), 24, 8); \
-  const u32 r1 = hc_bfe_S ((L), 16, 8); \
-  const u32 r2 = hc_bfe_S ((L),  8, 8); \
-  const u32 r3 = hc_bfe_S ((L),  0, 8); \
-                                        \
-  tmp  = S0[r0];                        \
-  tmp += S1[r1];                        \
-  tmp ^= S2[r2];                        \
-  tmp += S3[r3];                        \
-                                        \
-  (R) ^= tmp ^ P[(N)];                  \
+#define BF_ROUND(L,R,N)                       \
+{                                             \
+  u32 tmp;                                    \
+                                              \
+  const u32 r0 = unpack_v8d_from_v32_S ((L)); \
+  const u32 r1 = unpack_v8c_from_v32_S ((L)); \
+  const u32 r2 = unpack_v8b_from_v32_S ((L)); \
+  const u32 r3 = unpack_v8a_from_v32_S ((L)); \
+                                              \
+  tmp  = S0[r0];                              \
+  tmp += S1[r1];                              \
+  tmp ^= S2[r2];                              \
+  tmp += S3[r3];                              \
+                                              \
+  (R) ^= tmp ^ P[(N)];                        \
 }

 #define BF_ENCRYPT(L,R) \
@ -658,9 +658,6 @@ __kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m032
    L0 = 0;
    R0 = 0;

-    #ifdef _unroll
-    #pragma unroll
-    #endif
    for (u32 i = 0; i < 9; i++)
    {
      BF_ENCRYPT (L0, R0);
@ -723,9 +720,6 @@ __kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m032
    L0 = 0;
    R0 = 0;

-    #ifdef _unroll
-    #pragma unroll
-    #endif
    for (u32 i = 0; i < 9; i++)
    {
      BF_ENCRYPT (L0, R0);
--- a/OpenCL/m09000-pure.cl
+++ b/OpenCL/m09000-pure.cl
@ -308,51 +308,22 @@ __constant u32a c_pbox[18] =
  0x9216d5d9, 0x8979fb1b
 };

-#ifdef IS_AMD
-#define BF_ROUND(L,R,N)       \
-{                             \
-  uchar4 c = as_uchar4 ((L)); \
-                              \
-  u32 tmp;                    \
-                              \
-  tmp  = S0[c.s3];            \
-  tmp += S1[c.s2];            \
-  tmp ^= S2[c.s1];            \
-  tmp += S3[c.s0];            \
-                              \
-  (R) ^= tmp ^ P[(N)];        \
+#define BF_ROUND(L,R,N)                       \
+{                                             \
+  u32 tmp;                                    \
+                                              \
+  const u32 r0 = unpack_v8d_from_v32_S ((L)); \
+  const u32 r1 = unpack_v8c_from_v32_S ((L)); \
+  const u32 r2 = unpack_v8b_from_v32_S ((L)); \
+  const u32 r3 = unpack_v8a_from_v32_S ((L)); \
+                                              \
+  tmp  = S0[r0];                              \
+  tmp += S1[r1];                              \
+  tmp ^= S2[r2];                              \
+  tmp += S3[r3];                              \
+                                              \
+  (R) ^= tmp ^ P[(N)];                        \
 }
-#endif
-
-#ifdef IS_NV
-#define BF_ROUND(L,R,N)             \
-{                                   \
-  u32 tmp;                          \
-                                    \
-  tmp  = S0[hc_bfe_S ((L), 24, 8)];  \
-  tmp += S1[hc_bfe_S ((L), 16, 8)];  \
-  tmp ^= S2[hc_bfe_S ((L),  8, 8)];  \
-  tmp += S3[hc_bfe_S ((L),  0, 8)];  \
-                                    \
-  (R) ^= tmp ^ P[(N)];              \
-}
-#endif
-
-#ifdef IS_GENERIC
-#define BF_ROUND(L,R,N)       \
-{                             \
-  uchar4 c = as_uchar4 ((L)); \
-                              \
-  u32 tmp;                    \
-                              \
-  tmp  = S0[c.s3];            \
-  tmp += S1[c.s2];            \
-  tmp ^= S2[c.s1];            \
-  tmp += S3[c.s0];            \
-                              \
-  (R) ^= tmp ^ P[(N)];        \
-}
-#endif

 #define BF_ENCRYPT(L,R) \
 {                       \
@ -512,7 +483,7 @@ DECLSPEC void sha1_transform (const u32 *w0, const u32 *w1, const u32 *w2, const
  digest[4] += E;
 }

-__kernel void m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
+__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
 {
  /**
   * base
@ -626,10 +597,10 @@ __kernel void m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
    P[i] = c_pbox[i];
  }

-  __local u32 S0_all[8][256];
-  __local u32 S1_all[8][256];
-  __local u32 S2_all[8][256];
-  __local u32 S3_all[8][256];
+  __local u32 S0_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S1_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S2_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S3_all[FIXED_LOCAL_SIZE][256];

  __local u32 *S0 = S0_all[lid];
  __local u32 *S1 = S1_all[lid];
@ -731,7 +702,7 @@ __kernel void m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
  }
 }

-__kernel void m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t))
+__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t))
 {
  /**
   * base
@ -752,23 +723,21 @@ __kernel void m09000_loop (KERN_ATTR_TMPS (pwsafe2_tmp_t))

  u32 P[18];

-  #pragma unroll
  for (u32 i = 0; i < 18; i++)
  {
    P[i] = tmps[gid].P[i];
  }

-  __local u32 S0_all[8][256];
-  __local u32 S1_all[8][256];
-  __local u32 S2_all[8][256];
-  __local u32 S3_all[8][256];
+  __local u32 S0_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S1_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S2_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S3_all[FIXED_LOCAL_SIZE][256];

  __local u32 *S0 = S0_all[lid];
  __local u32 *S1 = S1_all[lid];
  __local u32 *S2 = S2_all[lid];
  __local u32 *S3 = S3_all[lid];

-  #pragma unroll
  for (u32 i = 0; i < 256; i++)
  {
    S0[i] = tmps[gid].S0[i];
--- a/OpenCL/m18600-pure.cl
+++ b/OpenCL/m18600-pure.cl
@ -318,18 +318,21 @@ __constant u32a c_pbox[18] =
  0x9216d5d9, 0x8979fb1b
 };

-#define BF_ROUND(L,R,N)       \
-{                             \
-  uchar4 c = as_uchar4 ((L)); \
-                              \
-  u32 tmp;                    \
-                              \
-  tmp  = S0[c.s3];            \
-  tmp += S1[c.s2];            \
-  tmp ^= S2[c.s1];            \
-  tmp += S3[c.s0];            \
-                              \
-  (R) ^= tmp ^ P[(N)];        \
+#define BF_ROUND(L,R,N)                       \
+{                                             \
+  u32 tmp;                                    \
+                                              \
+  const u32 r0 = unpack_v8d_from_v32_S ((L)); \
+  const u32 r1 = unpack_v8c_from_v32_S ((L)); \
+  const u32 r2 = unpack_v8b_from_v32_S ((L)); \
+  const u32 r3 = unpack_v8a_from_v32_S ((L)); \
+                                              \
+  tmp  = S0[r0];                              \
+  tmp += S1[r1];                              \
+  tmp ^= S2[r2];                              \
+  tmp += S3[r3];                              \
+                                              \
+  (R) ^= tmp ^ P[(N)];                        \
 }

 #define BF_ENCRYPT(L,R) \
@ -582,7 +585,7 @@ __kernel void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
  }
 }

-__kernel void m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
+__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
 {
  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
@ -612,10 +615,10 @@ __kernel void m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
    P[i] = c_pbox[i] ^ ukey[i % 4];
  }

-  __local u32 S0_all[8][256];
-  __local u32 S1_all[8][256];
-  __local u32 S2_all[8][256];
-  __local u32 S3_all[8][256];
+  __local u32 S0_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S1_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S2_all[FIXED_LOCAL_SIZE][256];
+  __local u32 S3_all[FIXED_LOCAL_SIZE][256];

  __local u32 *S0 = S0_all[lid];
  __local u32 *S1 = S1_all[lid];
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -43,7 +43,7 @@
 ## Improvements
 ##

- Cracking bcrypt: Use a feedback from the OpenCL runtime to dynamically find out optimal thread count
+- Cracking bcrypt and Password Safe v2: Use a feedback from the OpenCL runtime to dynamically find out optimal thread count
 - Bitcoin Wallet: Be more user friendly by allowing a larger data range for ckey and public_key
 - Building: Updated BUILD.md
 - My Wallet: Added additional plaintext pattern used in newer versions
--- a/include/types.h
+++ b/include/types.h
@ -1317,6 +1317,7 @@ typedef struct hc_device_param

  bool    has_vperm;
  bool    has_vadd3;
+  bool    has_vbfe;

  double  spin_damp;

--- a/src/modules/module_03200.c
+++ b/src/modules/module_03200.c
@ -80,11 +80,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
  // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value
  // which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result.
  // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1.
-  // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
-  // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
-  // on my development system. no clue where the 4 bytes are spent.
-  // I did some research on this and it seems to be related with the datatype.
-  // For example, if i used u8 instead, there's only 1 byte wasted.

  u32 fixed_local_size = 0;

@ -94,7 +89,20 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
  }
  else
  {
-    fixed_local_size = (device_param->device_local_mem_size - 4) / 4096;
+    u32 overhead = 0;
+
+    if (device_param->device_vendor_id == VENDOR_ID_NV)
+    {
+    // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
+    // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
+    // on my development system. no clue where the 4 bytes are spent.
+    // I did some research on this and it seems to be related with the datatype.
+    // For example, if i used u8 instead, there's only 1 byte wasted.
+
+      overhead = 4;
+    }
+
+    fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
  }

  hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
--- a/src/modules/module_09000.c
+++ b/src/modules/module_09000.c
@ -70,18 +70,31 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
  return tmp_size;
 }

-u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
-  const u32 kernel_threads_min = 8;  // Blowfish
+  char *jit_build_options = NULL;

-  return kernel_threads_min;
-}
+  u32 fixed_local_size = 0;

-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_threads_max = 8;  // Blowfish
+  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+  {
+    fixed_local_size = 1;
+  }
+  else
+  {
+    u32 overhead = 0;

-  return kernel_threads_max;
+    if (device_param->device_vendor_id == VENDOR_ID_NV)
+    {
+      overhead = 4;
+    }
+
+    fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+  }
+
+  hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
+
+  return jit_build_options;
 }

 bool module_outfile_check_disable (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@ -183,14 +196,14 @@ void module_init (module_ctx_t *module_ctx)
  module_ctx->module_hook23                   = MODULE_DEFAULT;
  module_ctx->module_hook_salt_size           = MODULE_DEFAULT;
  module_ctx->module_hook_size                = MODULE_DEFAULT;
-  module_ctx->module_jit_build_options        = MODULE_DEFAULT;
+  module_ctx->module_jit_build_options        = module_jit_build_options;
  module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
  module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
  module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
  module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
  module_ctx->module_kern_type                = module_kern_type;
  module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
  module_ctx->module_opti_type                = module_opti_type;
--- a/src/modules/module_18600.c
+++ b/src/modules/module_18600.c
@ -62,18 +62,31 @@ typedef struct odf11

 static const char *SIGNATURE_ODF = "$odf$";

-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
-  const u32 kernel_threads_max = 8; // Blowfish enforced
+  char *jit_build_options = NULL;

-  return kernel_threads_max;
-}
+  u32 fixed_local_size = 0;

-u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
-{
-  const u32 kernel_threads_min = 8; // Blowfish enforced
+  if (device_param->device_type & CL_DEVICE_TYPE_CPU)
+  {
+    fixed_local_size = 1;
+  }
+  else
+  {
+    u32 overhead = 0;

-  return kernel_threads_min;
+    if (device_param->device_vendor_id == VENDOR_ID_NV)
+    {
+      overhead = 4;
+    }
+
+    fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+  }
+
+  hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
+
+  return jit_build_options;
 }

 u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@ -335,14 +348,14 @@ void module_init (module_ctx_t *module_ctx)
  module_ctx->module_hook23                   = MODULE_DEFAULT;
  module_ctx->module_hook_salt_size           = MODULE_DEFAULT;
  module_ctx->module_hook_size                = MODULE_DEFAULT;
-  module_ctx->module_jit_build_options        = MODULE_DEFAULT;
+  module_ctx->module_jit_build_options        = module_jit_build_options;
  module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
  module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
  module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
  module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
-  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
-  module_ctx->module_kernel_threads_min       = module_kernel_threads_min;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
  module_ctx->module_kern_type                = module_kern_type;
  module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
  module_ctx->module_opti_type                = module_opti_type;
--- a/src/opencl.c
+++ b/src/opencl.c
@ -3958,6 +3958,10 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
        const bool has_vadd3 = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; __asm__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");

        device_param->has_vadd3 = has_vadd3;
+
+        const bool has_vbfe = test_instruction (hashcat_ctx, context, device_param->device, "__kernel void test () { uint r; __asm__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
+
+        device_param->has_vbfe = has_vbfe;
      }

      // device_available_mem
@ -4852,9 +4856,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
    // we don't have sm_* on vendors not NV but it doesn't matter

    #if defined (DEBUG)
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll ", device_param->device_local_mem_type, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->has_vperm, device_param->has_vadd3, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll ", device_param->device_local_mem_type, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
    #else
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->has_vperm, device_param->has_vadd3, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%u -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type);
    #endif

    /*