1
0
mirror of https://github.com/hashcat/hashcat.git synced 2024-11-22 08:08:10 +00:00

Scrypt Kernels: Reduced kernel wait times by making it a true split kernel where iteration count = N value

This commit is contained in:
Jens Steube 2021-04-21 15:59:14 +02:00
parent 56f47cabe2
commit 15f35fa68c
20 changed files with 724 additions and 313 deletions

View File

@ -62,6 +62,7 @@
MAYBE_UNUSED const u32 digests_cnt, \
MAYBE_UNUSED const u32 digests_offset_host, \
MAYBE_UNUSED const u32 combs_mode, \
MAYBE_UNUSED const u32 salt_repeat, \
MAYBE_UNUSED const u64 pws_pos, \
MAYBE_UNUSED const u64 gid_max
#else
@ -100,6 +101,7 @@
MAYBE_UNUSED const u32 digests_cnt, \
MAYBE_UNUSED const u32 digests_offset_host, \
MAYBE_UNUSED const u32 combs_mode, \
MAYBE_UNUSED const u32 salt_repeat, \
MAYBE_UNUSED const u64 pws_pos, \
MAYBE_UNUSED const u64 gid_max
#endif

View File

@ -1642,6 +1642,7 @@ typedef struct salt
u32 salt_iter;
u32 salt_iter2;
u32 salt_sign[2];
u32 salt_repeats;
u32 orig_pos;

View File

@ -170,14 +170,16 @@ DECLSPEC void salsa_r (uint4 *TI)
TO[idx_r2++] = R3;
}
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < STATE_CNT4; i++)
{
TI[i] = TO[i];
}
}
DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
{
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
@ -200,9 +202,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
case 3: V = V3; break;
}
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
@ -230,7 +229,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
}
for (u32 i = 0; i < SCRYPT_N; i++)
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
#else
T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
#endif
X[i + 0] = T[0];
X[i + 1] = T[1];
X[i + 2] = T[2];
X[i + 3] = T[3];
}
}
DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
{
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
const u32 zSIZE = STATE_CNT4;
const u32 x = get_global_id (0);
const u32 xd4 = x / 4;
const u32 xm4 = x & 3;
GLOBAL_AS uint4 *V;
switch (xm4)
{
case 0: V = V0; break;
case 1: V = V1; break;
case 2: V = V2; break;
case 3: V = V3; break;
}
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
#else
T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
#endif
X[i + 0] = T[0];
X[i + 1] = T[1];
X[i + 2] = T[2];
X[i + 3] = T[3];
}
for (u32 N_pos = 0; N_pos < 1024; N_pos++)
{
const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
@ -247,9 +310,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
salsa_r (X);
}
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
@ -341,6 +401,41 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t))
}
}
KERNEL_FQ void m08900_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
// SCRYPT part, init V
GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
uint4 X[STATE_CNT4];
uint4 T[STATE_CNT4];
const u32 P_offset = salt_repeat * STATE_CNT4;
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
}
KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
{
const u64 gid = get_global_id (0);
@ -355,28 +450,19 @@ KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
uint4 X[STATE_CNT4];
uint4 T[STATE_CNT4];
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]);
scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
const u32 P_offset = salt_repeat * STATE_CNT4;
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]);
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
#if SCRYPT_P >= 1
for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]);
scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]);
}
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
}
KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))

View File

@ -184,7 +184,7 @@ DECLSPEC void salsa_r (uint4 *TI)
}
}
DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
{
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
@ -207,9 +207,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
case 3: V = V3; break;
}
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
@ -237,7 +234,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
}
for (u32 i = 0; i < SCRYPT_N; i++)
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
#else
T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
#endif
X[i + 0] = T[0];
X[i + 1] = T[1];
X[i + 2] = T[2];
X[i + 3] = T[3];
}
}
DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
{
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
const u32 zSIZE = STATE_CNT4;
const u32 x = get_global_id (0);
const u32 xd4 = x / 4;
const u32 xm4 = x & 3;
GLOBAL_AS uint4 *V;
switch (xm4)
{
case 0: V = V0; break;
case 1: V = V1; break;
case 2: V = V2; break;
case 3: V = V3; break;
}
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
#else
T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
#endif
X[i + 0] = T[0];
X[i + 1] = T[1];
X[i + 2] = T[2];
X[i + 3] = T[3];
}
for (u32 N_pos = 0; N_pos < 1024; N_pos++)
{
const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
@ -254,9 +315,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
salsa_r (X);
}
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
@ -477,6 +535,41 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
}
}
KERNEL_FQ void m15700_loop_prepare (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
// SCRYPT part, init V
GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
uint4 X[STATE_CNT4];
uint4 T[STATE_CNT4];
const u32 P_offset = salt_repeat * STATE_CNT4;
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
}
KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
{
const u64 gid = get_global_id (0);
@ -491,28 +584,19 @@ KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
uint4 X[STATE_CNT4];
uint4 T[STATE_CNT4];
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]);
scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
const u32 P_offset = salt_repeat * STATE_CNT4;
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]);
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
#if SCRYPT_P >= 1
for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]);
scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]);
}
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
}
KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))

View File

@ -225,7 +225,7 @@ DECLSPEC void salsa_r (uint4 *TI)
}
}
DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
{
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
@ -248,9 +248,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
case 3: V = V3; break;
}
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
@ -278,7 +275,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
}
for (u32 i = 0; i < SCRYPT_N; i++)
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
#else
T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
#endif
X[i + 0] = T[0];
X[i + 1] = T[1];
X[i + 2] = T[2];
X[i + 3] = T[3];
}
}
DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
{
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
const u32 zSIZE = STATE_CNT4;
const u32 x = get_global_id (0);
const u32 xd4 = x / 4;
const u32 xm4 = x & 3;
GLOBAL_AS uint4 *V;
switch (xm4)
{
case 0: V = V0; break;
case 1: V = V1; break;
case 2: V = V2; break;
case 3: V = V3; break;
}
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
#else
T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
#endif
X[i + 0] = T[0];
X[i + 1] = T[1];
X[i + 2] = T[2];
X[i + 3] = T[3];
}
for (u32 N_pos = 0; N_pos < 1024; N_pos++)
{
const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
@ -295,9 +356,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui
salsa_r (X);
}
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < STATE_CNT4; i += 4)
{
#ifdef IS_CUDA
@ -429,6 +487,41 @@ KERNEL_FQ void m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t))
}
}
KERNEL_FQ void m22700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
// SCRYPT part, init V
GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
uint4 X[STATE_CNT4];
uint4 T[STATE_CNT4];
const u32 P_offset = salt_repeat * STATE_CNT4;
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
}
KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
{
const u64 gid = get_global_id (0);
@ -443,28 +536,19 @@ KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
uint4 X[STATE_CNT4];
uint4 T[STATE_CNT4];
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]);
scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
const u32 P_offset = salt_repeat * STATE_CNT4;
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]);
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
#if SCRYPT_P >= 1
for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]);
scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]);
}
#ifdef _unroll
#pragma unroll
#endif
for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
}
KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))

View File

@ -62,6 +62,7 @@
- OpenCL Runtime: Workaround JiT compiler deadlock on NVIDIA driver >= 465.89
- RAR3 Kernels: Improved loop code, improving performance by 23%
- Startup time: Improved the startup time by avoiding some time intensive operations for skipped devices
- Scrypt Kernels: Reduced kernel wait times by making it a true split kernel where iteration count = N value
##
## Technical

View File

@ -369,14 +369,14 @@ GeForce_GTX_TITAN 3 9900 2 A
## SCRYPT
##
DEVICE_TYPE_CPU * 8900 1 N 1
DEVICE_TYPE_GPU * 8900 1 N 1
DEVICE_TYPE_CPU * 9300 1 N 1
DEVICE_TYPE_GPU * 9300 1 N 1
DEVICE_TYPE_CPU * 15700 1 N 1
DEVICE_TYPE_GPU * 15700 1 1 1
DEVICE_TYPE_CPU * 22700 1 N 1
DEVICE_TYPE_GPU * 22700 1 N 1
DEVICE_TYPE_CPU * 8900 1 N A
DEVICE_TYPE_GPU * 8900 1 N A
DEVICE_TYPE_CPU * 9300 1 N A
DEVICE_TYPE_GPU * 9300 1 N A
DEVICE_TYPE_CPU * 15700 1 N A
DEVICE_TYPE_GPU * 15700 1 1 A
DEVICE_TYPE_CPU * 22700 1 N A
DEVICE_TYPE_GPU * 22700 1 N A
## Here's an example of how to manually tune SCRYPT algorithm kernels for your hardware.
## Manually tuning the GPU will yield increased performance. There is typically no noticeable change to CPU performance.
@ -466,12 +466,12 @@ DEVICE_TYPE_GPU * 22700 1 N
## Find the ideal -n value, then store it here along with the proper compute device name.
## Formatting guidelines are available at the top of this document.
GeForce_GTX_980 * 8900 1 28 1
GeForce_GTX_980 * 9300 1 128 1
GeForce_GTX_980 * 15700 1 1 1
GeForce_GTX_980 * 22700 1 28 1
GeForce_GTX_980 * 8900 1 28 A
GeForce_GTX_980 * 9300 1 128 A
GeForce_GTX_980 * 15700 1 1 A
GeForce_GTX_980 * 22700 1 28 A
GeForce_RTX_2080_Ti * 8900 1 N 1
GeForce_RTX_2080_Ti * 9300 1 544 1
GeForce_RTX_2080_Ti * 15700 1 4 1
GeForce_RTX_2080_Ti * 22700 1 N 1
GeForce_RTX_2080_Ti * 8900 1 N A
GeForce_RTX_2080_Ti * 9300 1 544 A
GeForce_RTX_2080_Ti * 15700 1 4 A
GeForce_RTX_2080_Ti * 22700 1 N A

View File

@ -257,12 +257,14 @@ typedef enum kern_run
{
KERN_RUN_1 = 1000,
KERN_RUN_12 = 1500,
KERN_RUN_2P = 1999,
KERN_RUN_2 = 2000,
KERN_RUN_2E = 2001,
KERN_RUN_23 = 2500,
KERN_RUN_3 = 3000,
KERN_RUN_4 = 4000,
KERN_RUN_INIT2 = 5000,
KERN_RUN_LOOP2P = 5999,
KERN_RUN_LOOP2 = 6000,
KERN_RUN_AUX1 = 7001,
KERN_RUN_AUX2 = 7002,
@ -412,30 +414,33 @@ typedef enum opts_type
OPTS_TYPE_ST_BASE64 = (1ULL << 26),
OPTS_TYPE_HASH_COPY = (1ULL << 28),
OPTS_TYPE_HASH_SPLIT = (1ULL << 29),
OPTS_TYPE_LOOP_EXTENDED = (1ULL << 30), // a kernel which is called each time normal _loop kernel finished.
OPTS_TYPE_LOOP_PREPARE = (1ULL << 30), // a kernel which is called each time before _loop kernel started.
// like a hook12 kernel but without extra buffers.
OPTS_TYPE_LOOP_EXTENDED = (1ULL << 31), // a kernel which is called each time normal _loop kernel finished.
// but unlike a hook kernel this kernel is called for every _loop iteration offset
OPTS_TYPE_HOOK12 = (1ULL << 31),
OPTS_TYPE_HOOK23 = (1ULL << 32),
OPTS_TYPE_INIT2 = (1ULL << 33),
OPTS_TYPE_LOOP2 = (1ULL << 34),
OPTS_TYPE_AUX1 = (1ULL << 35),
OPTS_TYPE_AUX2 = (1ULL << 36),
OPTS_TYPE_AUX3 = (1ULL << 37),
OPTS_TYPE_AUX4 = (1ULL << 38),
OPTS_TYPE_BINARY_HASHFILE = (1ULL << 39),
OPTS_TYPE_HOOK12 = (1ULL << 32),
OPTS_TYPE_HOOK23 = (1ULL << 33),
OPTS_TYPE_INIT2 = (1ULL << 34),
OPTS_TYPE_LOOP2_PREPARE = (1ULL << 35), // same as OPTS_TYPE_LOOP_PREPARE but for loop2 kernel
OPTS_TYPE_LOOP2 = (1ULL << 36),
OPTS_TYPE_AUX1 = (1ULL << 37),
OPTS_TYPE_AUX2 = (1ULL << 38),
OPTS_TYPE_AUX3 = (1ULL << 39),
OPTS_TYPE_AUX4 = (1ULL << 40),
OPTS_TYPE_BINARY_HASHFILE = (1ULL << 41),
OPTS_TYPE_BINARY_HASHFILE_OPTIONAL
= (1ULL << 40), // this allows us to not enforce the use of a binary file. requires OPTS_TYPE_BINARY_HASHFILE set to be effective.
OPTS_TYPE_PT_ADD06 = (1ULL << 41),
OPTS_TYPE_KEYBOARD_MAPPING = (1ULL << 42),
OPTS_TYPE_DEEP_COMP_KERNEL = (1ULL << 43), // if we have to iterate through each hash inside the comp kernel, for example if each hash has to be decrypted separately
OPTS_TYPE_TM_KERNEL = (1ULL << 44),
OPTS_TYPE_SUGGEST_KG = (1ULL << 45), // suggest keep guessing for modules the user maybe wants to use --keep-guessing
OPTS_TYPE_COPY_TMPS = (1ULL << 46), // if we want to use data from tmps buffer (for example get the PMK in WPA)
OPTS_TYPE_POTFILE_NOPASS = (1ULL << 47), // sometimes the password should not be printed to potfile
OPTS_TYPE_DYNAMIC_SHARED = (1ULL << 48), // use dynamic shared memory (note: needs special kernel changes)
OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 49), // some algos use JiT in combinations with a salt or create too much startup time
OPTS_TYPE_MP_MULTI_DISABLE = (1ULL << 50), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
OPTS_TYPE_NATIVE_THREADS = (1ULL << 51), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
= (1ULL << 42), // this allows us to not enforce the use of a binary file. requires OPTS_TYPE_BINARY_HASHFILE set to be effective.
OPTS_TYPE_PT_ADD06 = (1ULL << 43),
OPTS_TYPE_KEYBOARD_MAPPING = (1ULL << 44),
OPTS_TYPE_DEEP_COMP_KERNEL = (1ULL << 45), // if we have to iterate through each hash inside the comp kernel, for example if each hash has to be decrypted separately
OPTS_TYPE_TM_KERNEL = (1ULL << 46),
OPTS_TYPE_SUGGEST_KG = (1ULL << 47), // suggest keep guessing for modules the user maybe wants to use --keep-guessing
OPTS_TYPE_COPY_TMPS = (1ULL << 48), // if we want to use data from tmps buffer (for example get the PMK in WPA)
OPTS_TYPE_POTFILE_NOPASS = (1ULL << 49), // sometimes the password should not be printed to potfile
OPTS_TYPE_DYNAMIC_SHARED = (1ULL << 50), // use dynamic shared memory (note: needs special kernel changes)
OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 51), // some algos use JiT in combinations with a salt or create too much startup time
OPTS_TYPE_MP_MULTI_DISABLE = (1ULL << 52), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
OPTS_TYPE_NATIVE_THREADS = (1ULL << 53), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
} opts_type_t;
@ -1094,12 +1099,14 @@ typedef struct hc_device_param
u32 kernel_wgs1;
u32 kernel_wgs12;
u32 kernel_wgs2p;
u32 kernel_wgs2;
u32 kernel_wgs2e;
u32 kernel_wgs23;
u32 kernel_wgs3;
u32 kernel_wgs4;
u32 kernel_wgs_init2;
u32 kernel_wgs_loop2p;
u32 kernel_wgs_loop2;
u32 kernel_wgs_mp;
u32 kernel_wgs_mp_l;
@ -1116,12 +1123,14 @@ typedef struct hc_device_param
u32 kernel_preferred_wgs_multiple1;
u32 kernel_preferred_wgs_multiple12;
u32 kernel_preferred_wgs_multiple2p;
u32 kernel_preferred_wgs_multiple2;
u32 kernel_preferred_wgs_multiple2e;
u32 kernel_preferred_wgs_multiple23;
u32 kernel_preferred_wgs_multiple3;
u32 kernel_preferred_wgs_multiple4;
u32 kernel_preferred_wgs_multiple_init2;
u32 kernel_preferred_wgs_multiple_loop2p;
u32 kernel_preferred_wgs_multiple_loop2;
u32 kernel_preferred_wgs_multiple_mp;
u32 kernel_preferred_wgs_multiple_mp_l;
@ -1138,12 +1147,14 @@ typedef struct hc_device_param
u64 kernel_local_mem_size1;
u64 kernel_local_mem_size12;
u64 kernel_local_mem_size2p;
u64 kernel_local_mem_size2;
u64 kernel_local_mem_size2e;
u64 kernel_local_mem_size23;
u64 kernel_local_mem_size3;
u64 kernel_local_mem_size4;
u64 kernel_local_mem_size_init2;
u64 kernel_local_mem_size_loop2p;
u64 kernel_local_mem_size_loop2;
u64 kernel_local_mem_size_mp;
u64 kernel_local_mem_size_mp_l;
@ -1160,12 +1171,14 @@ typedef struct hc_device_param
u64 kernel_dynamic_local_mem_size1;
u64 kernel_dynamic_local_mem_size12;
u64 kernel_dynamic_local_mem_size2p;
u64 kernel_dynamic_local_mem_size2;
u64 kernel_dynamic_local_mem_size2e;
u64 kernel_dynamic_local_mem_size23;
u64 kernel_dynamic_local_mem_size3;
u64 kernel_dynamic_local_mem_size4;
u64 kernel_dynamic_local_mem_size_init2;
u64 kernel_dynamic_local_mem_size_loop2p;
u64 kernel_dynamic_local_mem_size_loop2;
u64 kernel_dynamic_local_mem_size_mp;
u64 kernel_dynamic_local_mem_size_mp_l;
@ -1273,11 +1286,13 @@ typedef struct hc_device_param
// workaround cpu spinning
double exec_us_prev1[EXPECTED_ITERATIONS];
double exec_us_prev2p[EXPECTED_ITERATIONS];
double exec_us_prev2[EXPECTED_ITERATIONS];
double exec_us_prev2e[EXPECTED_ITERATIONS];
double exec_us_prev3[EXPECTED_ITERATIONS];
double exec_us_prev4[EXPECTED_ITERATIONS];
double exec_us_prev_init2[EXPECTED_ITERATIONS];
double exec_us_prev_loop2p[EXPECTED_ITERATIONS];
double exec_us_prev_loop2[EXPECTED_ITERATIONS];
double exec_us_prev_aux1[EXPECTED_ITERATIONS];
double exec_us_prev_aux2[EXPECTED_ITERATIONS];
@ -1378,12 +1393,14 @@ typedef struct hc_device_param
CUfunction cuda_function1;
CUfunction cuda_function12;
CUfunction cuda_function2p;
CUfunction cuda_function2;
CUfunction cuda_function2e;
CUfunction cuda_function23;
CUfunction cuda_function3;
CUfunction cuda_function4;
CUfunction cuda_function_init2;
CUfunction cuda_function_loop2p;
CUfunction cuda_function_loop2;
CUfunction cuda_function_mp;
CUfunction cuda_function_mp_l;
@ -1462,12 +1479,14 @@ typedef struct hc_device_param
cl_kernel opencl_kernel1;
cl_kernel opencl_kernel12;
cl_kernel opencl_kernel2p;
cl_kernel opencl_kernel2;
cl_kernel opencl_kernel2e;
cl_kernel opencl_kernel23;
cl_kernel opencl_kernel3;
cl_kernel opencl_kernel4;
cl_kernel opencl_kernel_init2;
cl_kernel opencl_kernel_loop2p;
cl_kernel opencl_kernel_loop2;
cl_kernel opencl_kernel_mp;
cl_kernel opencl_kernel_mp_l;

View File

@ -2998,11 +2998,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
}
else
{
bool run_init = true;
bool run_loop = true;
bool run_comp = true;
if (run_init == true)
if (true)
{
if (device_param->is_cuda == true)
{
@ -3089,165 +3085,190 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
}
}
if (run_loop == true)
if (true)
{
u32 iter = hashes->salts_buf[salt_pos].salt_iter;
const u32 salt_repeats = hashes->salts_buf[salt_pos].salt_repeats;
u32 loop_step = device_param->kernel_loops;
for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
for (u32 salt_repeat = 0; salt_repeat <= salt_repeats; salt_repeat++)
{
u32 loop_left = iter - loop_pos;
device_param->kernel_params_buf32[34] = salt_repeat;
loop_left = MIN (loop_left, loop_step);
device_param->kernel_params_buf32[28] = loop_pos;
device_param->kernel_params_buf32[29] = loop_left;
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
{
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, pws_pos, pws_cnt, false, 0) == -1) return -1;
}
//bug?
//while (status_ctx->run_thread_level2 == false) break;
if (status_ctx->run_thread_level2 == false) break;
/**
* speed
*/
const float iter_part = (float) (loop_pos + loop_left) / iter;
const u64 perf_sum_all = (u64) (pws_cnt * iter_part);
double speed_msec = hc_timer_get (device_param->timer_speed);
const u32 speed_pos = device_param->speed_pos;
device_param->speed_cnt[speed_pos] = perf_sum_all;
device_param->speed_msec[speed_pos] = speed_msec;
if (user_options->speed_only == true)
if (true)
{
if (speed_msec > 4000)
const u32 iter = hashes->salts_buf[salt_pos].salt_iter;
const u32 loop_step = device_param->kernel_loops;
for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
{
device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left);
u32 loop_left = iter - loop_pos;
device_param->speed_pos = 1;
loop_left = MIN (loop_left, loop_step);
device_param->speed_only_finish = true;
device_param->kernel_params_buf32[28] = loop_pos;
device_param->kernel_params_buf32[29] = loop_left;
return 0;
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
{
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
}
//bug?
//while (status_ctx->run_thread_level2 == false) break;
if (status_ctx->run_thread_level2 == false) break;
/**
* speed
*/
const float iter_part = (float) (loop_pos + loop_left) / iter;
const u64 perf_sum_all = (u64) (pws_cnt * iter_part);
double speed_msec = hc_timer_get (device_param->timer_speed);
const u32 speed_pos = device_param->speed_pos;
device_param->speed_cnt[speed_pos] = perf_sum_all;
device_param->speed_msec[speed_pos] = speed_msec;
if (user_options->speed_only == true)
{
if (speed_msec > 4000)
{
device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left);
device_param->speed_pos = 1;
device_param->speed_only_finish = true;
return 0;
}
}
}
if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
{
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_pos, pws_cnt, false, 0) == -1) return -1;
if (device_param->is_cuda == true)
{
if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
}
if (device_param->is_opencl == true)
{
if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
}
const int hook_threads = (int) user_options->hook_threads;
hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t));
for (int i = 0; i < hook_threads; i++)
{
hook_thread_param_t *hook_thread_param = hook_threads_param + i;
hook_thread_param->tid = i;
hook_thread_param->tsz = hook_threads;
hook_thread_param->module_ctx = module_ctx;
hook_thread_param->status_ctx = status_ctx;
hook_thread_param->device_param = device_param;
hook_thread_param->hook_extra_param = module_ctx->hook_extra_params[i];
hook_thread_param->hook_salts_buf = hashes->hook_salts_buf;
hook_thread_param->salt_pos = salt_pos;
hook_thread_param->pws_cnt = pws_cnt;
}
hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t));
for (int i = 0; i < hook_threads; i++)
{
hook_thread_param_t *hook_thread_param = hook_threads_param + i;
hc_thread_create (c_threads[i], hook23_thread, hook_thread_param);
}
hc_thread_wait (hook_threads, c_threads);
hcfree (c_threads);
hcfree (hook_threads_param);
if (device_param->is_cuda == true)
{
if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
}
if (device_param->is_opencl == true)
{
if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
}
}
}
}
if (hashconfig->opts_type & OPTS_TYPE_HOOK23)
{
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_pos, pws_cnt, false, 0) == -1) return -1;
if (device_param->is_cuda == true)
{
if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
}
if (device_param->is_opencl == true)
{
if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
}
const int hook_threads = (int) user_options->hook_threads;
hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t));
for (int i = 0; i < hook_threads; i++)
{
hook_thread_param_t *hook_thread_param = hook_threads_param + i;
hook_thread_param->tid = i;
hook_thread_param->tsz = hook_threads;
hook_thread_param->module_ctx = module_ctx;
hook_thread_param->status_ctx = status_ctx;
hook_thread_param->device_param = device_param;
hook_thread_param->hook_extra_param = module_ctx->hook_extra_params[i];
hook_thread_param->hook_salts_buf = hashes->hook_salts_buf;
hook_thread_param->salt_pos = salt_pos;
hook_thread_param->pws_cnt = pws_cnt;
}
hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t));
for (int i = 0; i < hook_threads; i++)
{
hook_thread_param_t *hook_thread_param = hook_threads_param + i;
hc_thread_create (c_threads[i], hook23_thread, hook_thread_param);
}
hc_thread_wait (hook_threads, c_threads);
hcfree (c_threads);
hcfree (hook_threads_param);
if (device_param->is_cuda == true)
{
if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
}
if (device_param->is_opencl == true)
{
if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
}
}
}
// init2 and loop2 are kind of special, we use run_loop for them, too
// note: they also do not influence the performance screen
// in case you want to use this, this cane make sense only if your input data comes out of tmps[]
if (run_loop == true)
if (hashconfig->opts_type & OPTS_TYPE_INIT2)
{
// note: they also do not influence the performance screen
// in case you want to use this, this cane make sense only if your input data comes out of tmps[]
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_pos, pws_cnt, false, 0) == -1) return -1;
}
if (hashconfig->opts_type & OPTS_TYPE_INIT2)
if (true)
{
const u32 salt_repeats = hashes->salts_buf[salt_pos].salt_repeats;
for (u32 salt_repeat = 0; salt_repeat <= salt_repeats; salt_repeat++)
{
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_pos, pws_cnt, false, 0) == -1) return -1;
}
device_param->kernel_params_buf32[34] = salt_repeat;
if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
{
u32 iter = hashes->salts_buf[salt_pos].salt_iter2;
u32 loop_step = device_param->kernel_loops;
for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
{
u32 loop_left = iter - loop_pos;
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2P, pws_pos, pws_cnt, false, 0) == -1) return -1;
}
loop_left = MIN (loop_left, loop_step);
if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
{
u32 iter = hashes->salts_buf[salt_pos].salt_iter2;
device_param->kernel_params_buf32[28] = loop_pos;
device_param->kernel_params_buf32[29] = loop_left;
u32 loop_step = device_param->kernel_loops;
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++)
{
u32 loop_left = iter - loop_pos;
//bug?
//while (status_ctx->run_thread_level2 == false) break;
if (status_ctx->run_thread_level2 == false) break;
loop_left = MIN (loop_left, loop_step);
device_param->kernel_params_buf32[28] = loop_pos;
device_param->kernel_params_buf32[29] = loop_left;
if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1;
//bug?
//while (status_ctx->run_thread_level2 == false) break;
if (status_ctx->run_thread_level2 == false) break;
}
}
}
}
if (run_comp == true)
if (true)
{
if (hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL)
{
@ -3525,6 +3546,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
kernel_threads = device_param->kernel_wgs12;
dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size12;
break;
case KERN_RUN_2P:
kernel_threads = device_param->kernel_wgs2p;
dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2p;
break;
case KERN_RUN_2:
kernel_threads = device_param->kernel_wgs2;
dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2;
@ -3549,6 +3574,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
kernel_threads = device_param->kernel_wgs_init2;
dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_init2;
break;
case KERN_RUN_LOOP2P:
kernel_threads = device_param->kernel_wgs_loop2p;
dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2p;
break;
case KERN_RUN_LOOP2:
kernel_threads = device_param->kernel_wgs_loop2;
dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2;
@ -3590,8 +3619,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
kernel_threads = MIN (kernel_threads, device_param->kernel_threads);
device_param->kernel_params_buf64[34] = pws_pos;
device_param->kernel_params_buf64[35] = num;
device_param->kernel_params_buf64[35] = pws_pos;
device_param->kernel_params_buf64[36] = num;
u64 num_elements = num;
@ -3603,19 +3632,21 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
{
switch (kern_run)
{
case KERN_RUN_1: cuda_function = device_param->cuda_function1; break;
case KERN_RUN_12: cuda_function = device_param->cuda_function12; break;
case KERN_RUN_2: cuda_function = device_param->cuda_function2; break;
case KERN_RUN_2E: cuda_function = device_param->cuda_function2e; break;
case KERN_RUN_23: cuda_function = device_param->cuda_function23; break;
case KERN_RUN_3: cuda_function = device_param->cuda_function3; break;
case KERN_RUN_4: cuda_function = device_param->cuda_function4; break;
case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break;
case KERN_RUN_LOOP2: cuda_function = device_param->cuda_function_loop2; break;
case KERN_RUN_AUX1: cuda_function = device_param->cuda_function_aux1; break;
case KERN_RUN_AUX2: cuda_function = device_param->cuda_function_aux2; break;
case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break;
case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break;
case KERN_RUN_1: cuda_function = device_param->cuda_function1; break;
case KERN_RUN_12: cuda_function = device_param->cuda_function12; break;
case KERN_RUN_2P: cuda_function = device_param->cuda_function2p; break;
case KERN_RUN_2: cuda_function = device_param->cuda_function2; break;
case KERN_RUN_2E: cuda_function = device_param->cuda_function2e; break;
case KERN_RUN_23: cuda_function = device_param->cuda_function23; break;
case KERN_RUN_3: cuda_function = device_param->cuda_function3; break;
case KERN_RUN_4: cuda_function = device_param->cuda_function4; break;
case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break;
case KERN_RUN_LOOP2P: cuda_function = device_param->cuda_function_loop2p; break;
case KERN_RUN_LOOP2: cuda_function = device_param->cuda_function_loop2; break;
case KERN_RUN_AUX1: cuda_function = device_param->cuda_function_aux1; break;
case KERN_RUN_AUX2: cuda_function = device_param->cuda_function_aux2; break;
case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break;
case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break;
}
if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1;
@ -3700,19 +3731,21 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
{
switch (kern_run)
{
case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break;
case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break;
case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break;
case KERN_RUN_2E: opencl_kernel = device_param->opencl_kernel2e; break;
case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break;
case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break;
case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break;
case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break;
case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break;
case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break;
case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break;
case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break;
case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break;
case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break;
case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break;
case KERN_RUN_2P: opencl_kernel = device_param->opencl_kernel2p; break;
case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break;
case KERN_RUN_2E: opencl_kernel = device_param->opencl_kernel2e; break;
case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break;
case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break;
case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break;
case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break;
case KERN_RUN_LOOP2P: opencl_kernel = device_param->opencl_kernel_loop2p; break;
case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break;
case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break;
case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break;
case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break;
case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break;
}
}
@ -3721,12 +3754,12 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]) == -1) return -1;
}
for (u32 i = 24; i <= 33; i++)
for (u32 i = 24; i <= 34; i++)
{
if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]) == -1) return -1;
}
for (u32 i = 34; i <= 35; i++)
for (u32 i = 35; i <= 36; i++)
{
if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]) == -1) return -1;
}
@ -3786,17 +3819,19 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
{
switch (kern_run)
{
case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_2E: if (device_param->exec_us_prev2e[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2e[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_2P: if (device_param->exec_us_prev2p[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2p[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_2E: if (device_param->exec_us_prev2e[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2e[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_LOOP2P: if (device_param->exec_us_prev_loop2p[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2p[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break;
case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break;
}
}
else
@ -3830,17 +3865,19 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
{
switch (kern_run)
{
case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break;
case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break;
case KERN_RUN_2E: device_param->exec_us_prev2e[iterationm] = exec_us; break;
case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break;
case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break;
case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break;
case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break;
case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break;
case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break;
case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break;
case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break;
case KERN_RUN_2P: device_param->exec_us_prev2p[iterationm] = exec_us; break;
case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break;
case KERN_RUN_2E: device_param->exec_us_prev2e[iterationm] = exec_us; break;
case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break;
case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break;
case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break;
case KERN_RUN_LOOP2P: device_param->exec_us_prev_loop2p[iterationm] = exec_us; break;
case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break;
case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break;
case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break;
case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break;
case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break;
}
}
}
@ -9086,8 +9123,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
device_param->kernel_params_buf32[31] = 0; // digests_cnt
device_param->kernel_params_buf32[32] = 0; // digests_offset
device_param->kernel_params_buf32[33] = 0; // combs_mode
device_param->kernel_params_buf64[34] = 0; // pws_pos
device_param->kernel_params_buf64[35] = 0; // gid_max
device_param->kernel_params_buf32[34] = 0; // salt_repeat
device_param->kernel_params_buf64[35] = 0; // pws_pos
device_param->kernel_params_buf64[36] = 0; // gid_max
if (device_param->is_cuda == true)
{
@ -9155,8 +9193,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
device_param->kernel_params[31] = &device_param->kernel_params_buf32[31];
device_param->kernel_params[32] = &device_param->kernel_params_buf32[32];
device_param->kernel_params[33] = &device_param->kernel_params_buf32[33];
device_param->kernel_params[34] = &device_param->kernel_params_buf64[34];
device_param->kernel_params[34] = &device_param->kernel_params_buf32[34];
device_param->kernel_params[35] = &device_param->kernel_params_buf64[35];
device_param->kernel_params[36] = &device_param->kernel_params_buf64[36];
if (user_options->slow_candidates == true)
{
@ -9554,6 +9593,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
{
// kernel2p
snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_prepare", kern_type);
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2p, device_param->cuda_module, kernel_name) == -1) return -1;
if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_wgs2p) == -1) return -1;
if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_local_mem_size2p) == -1) return -1;
if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_dynamic_local_mem_size2p) == -1) return -1;
device_param->kernel_preferred_wgs_multiple2p = device_param->cuda_warp_size;
}
if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
{
// kernel2e
@ -9622,6 +9678,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
device_param->kernel_preferred_wgs_multiple_init2 = device_param->cuda_warp_size;
}
// loop2 prepare
if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
{
snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2_prepare", kern_type);
if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_loop2p, device_param->cuda_module, kernel_name) == -1) return -1;
if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_wgs_loop2p) == -1) return -1;
if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_local_mem_size_loop2p) == -1) return -1;
if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_dynamic_local_mem_size_loop2p) == -1) return -1;
device_param->kernel_preferred_wgs_multiple_loop2p = device_param->cuda_warp_size;
}
// loop2
if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
@ -10142,6 +10215,21 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
// aux1
if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
{
snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_prepare", kern_type);
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel2p) == -1) return -1;
if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_wgs2p) == -1) return -1;
if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_local_mem_size2p) == -1) return -1;
if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_dynamic_local_mem_size2p) == -1) return -1;
if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_preferred_wgs_multiple2p) == -1) return -1;
}
if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
{
snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_extended", kern_type);
@ -10208,6 +10296,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_preferred_wgs_multiple_init2) == -1) return -1;
}
// loop2 prepare
if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
{
snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2_prepare", kern_type);
if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_loop2p) == -1) return -1;
if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_wgs_loop2p) == -1) return -1;
if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_local_mem_size_loop2p) == -1) return -1;
if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_dynamic_local_mem_size_loop2p) == -1) return -1;
if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_preferred_wgs_multiple_loop2p) == -1) return -1;
}
// loop2
if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
@ -11071,12 +11176,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
device_param->cuda_function1 = NULL;
device_param->cuda_function12 = NULL;
device_param->cuda_function2p = NULL;
device_param->cuda_function2 = NULL;
device_param->cuda_function2e = NULL;
device_param->cuda_function23 = NULL;
device_param->cuda_function3 = NULL;
device_param->cuda_function4 = NULL;
device_param->cuda_function_init2 = NULL;
device_param->cuda_function_loop2p = NULL;
device_param->cuda_function_loop2 = NULL;
device_param->cuda_function_mp = NULL;
device_param->cuda_function_mp_l = NULL;
@ -11139,12 +11246,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
if (device_param->opencl_kernel1) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel1);
if (device_param->opencl_kernel12) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel12);
if (device_param->opencl_kernel2p) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2p);
if (device_param->opencl_kernel2) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2);
if (device_param->opencl_kernel2e) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2e);
if (device_param->opencl_kernel23) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel23);
if (device_param->opencl_kernel3) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel3);
if (device_param->opencl_kernel4) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel4);
if (device_param->opencl_kernel_init2) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_init2);
if (device_param->opencl_kernel_loop2p) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_loop2p);
if (device_param->opencl_kernel_loop2) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_loop2);
if (device_param->opencl_kernel_mp) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp);
if (device_param->opencl_kernel_mp_l) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp_l);
@ -11205,12 +11314,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
device_param->opencl_d_st_esalts_buf = NULL;
device_param->opencl_kernel1 = NULL;
device_param->opencl_kernel12 = NULL;
device_param->opencl_kernel2p = NULL;
device_param->opencl_kernel2 = NULL;
device_param->opencl_kernel2e = NULL;
device_param->opencl_kernel23 = NULL;
device_param->opencl_kernel3 = NULL;
device_param->opencl_kernel4 = NULL;
device_param->opencl_kernel_init2 = NULL;
device_param->opencl_kernel_loop2p = NULL;
device_param->opencl_kernel_loop2 = NULL;
device_param->opencl_kernel_mp = NULL;
device_param->opencl_kernel_mp_l = NULL;

View File

@ -579,6 +579,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
1, // digests_cnt
0, // digests_offset
0, // combs_mode
0, // salt_repeat
0, // pws_pos
1 // gid_max
);

View File

@ -554,6 +554,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
1, // digests_cnt
0, // digests_offset
0, // combs_mode
0, // salt_repeat
0, // pws_pos
1 // gid_max
);

View File

@ -21,6 +21,7 @@ static const char *HASH_NAME = "bcrypt $2*$, Blowfish (Unix)";
static const u64 KERN_TYPE = 3200;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
| OPTS_TYPE_MP_MULTI_DISABLE
| OPTS_TYPE_DYNAMIC_SHARED;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
static const char *ST_PASS = "hashcat";

View File

@ -24,6 +24,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
| OPTS_TYPE_MP_MULTI_DISABLE
| OPTS_TYPE_NATIVE_THREADS
| OPTS_TYPE_LOOP_PREPARE
| OPTS_TYPE_SELF_TEST_DISABLE;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
static const char *ST_PASS = "hashcat";
@ -63,14 +64,14 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 1;
const u32 kernel_loops_min = 1024;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1;
const u32 kernel_loops_max = 1024;
return kernel_loops_max;
}
@ -330,6 +331,11 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
salt->scrypt_r = hc_strtoul ((const char *) r_pos, NULL, 10);
salt->scrypt_p = hc_strtoul ((const char *) p_pos, NULL, 10);
salt->salt_iter = salt->scrypt_N;
salt->salt_repeats = salt->scrypt_p - 1;
if (salt->scrypt_N % 1024) return (PARSER_SALT_VALUE); // we set loop count to 1024 fixed
// salt
const u8 *salt_pos = token.buf[4];
@ -341,8 +347,7 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
memcpy (salt->salt_buf, tmp_buf, tmp_len);
salt->salt_len = tmp_len;
salt->salt_iter = 1;
salt->salt_len = tmp_len;
// digest - base64 decode

View File

@ -24,6 +24,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
| OPTS_TYPE_MP_MULTI_DISABLE
| OPTS_TYPE_NATIVE_THREADS
| OPTS_TYPE_LOOP_PREPARE
| OPTS_TYPE_SELF_TEST_DISABLE;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
static const char *ST_PASS = "hashcat";
@ -52,14 +53,14 @@ static const u64 SCRYPT_P = 1;
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 1;
const u32 kernel_loops_min = 1024;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1;
const u32 kernel_loops_max = 1024;
return kernel_loops_max;
}
@ -299,11 +300,14 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
memcpy (salt_buf_ptr, salt_pos, salt_len);
salt->salt_len = salt_len;
salt->salt_iter = 1;
salt->scrypt_N = 16384;
salt->scrypt_r = 1;
salt->scrypt_p = 1;
salt->scrypt_N = SCRYPT_N;
salt->scrypt_r = SCRYPT_R;
salt->scrypt_p = SCRYPT_P;
salt->salt_iter = salt->scrypt_N;
salt->salt_repeats = salt->scrypt_p - 1;
// base64 decode hash

View File

@ -24,6 +24,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
| OPTS_TYPE_MP_MULTI_DISABLE
| OPTS_TYPE_NATIVE_THREADS
| OPTS_TYPE_LOOP_PREPARE
| OPTS_TYPE_SELF_TEST_DISABLE
| OPTS_TYPE_ST_HEX;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
@ -60,14 +61,14 @@ static const u64 SCRYPT_P = 1;
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 1;
const u32 kernel_loops_min = 1024;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1;
const u32 kernel_loops_max = 1024;
return kernel_loops_max;
}
@ -349,6 +350,11 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
salt->scrypt_r = scrypt_r;
salt->scrypt_p = scrypt_p;
salt->salt_iter = salt->scrypt_N;
salt->salt_repeats = salt->scrypt_p - 1;
if (salt->scrypt_N % 1024) return (PARSER_SALT_VALUE); // we set loop count to 1024 fixed
// salt
const u8 *salt_pos = token.buf[4];
@ -367,8 +373,6 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
ethereum_scrypt->salt_buf[6] = salt->salt_buf[6];
ethereum_scrypt->salt_buf[7] = salt->salt_buf[7];
salt->salt_iter = 1;
// ciphertext
const u8 *ciphertext_pos = token.buf[5];

View File

@ -290,6 +290,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
1, // digests_cnt
0, // digests_offset
0, // combs_mode
0, // salt_repeat
0, // pws_pos
1 // gid_max
);

View File

@ -312,6 +312,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
1, // digests_cnt
0, // digests_offset
0, // combs_mode
0, // salt_repeat
0, // pws_pos
1 // gid_max
);

View File

@ -600,6 +600,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
1, // digests_cnt
0, // digests_offset
0, // combs_mode
0, // salt_repeat
0, // pws_pos
1 // gid_max
);

View File

@ -601,6 +601,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
1, // digests_cnt
0, // digests_offset
0, // combs_mode
0, // salt_repeat
0, // pws_pos
1 // gid_max
);

View File

@ -25,6 +25,7 @@ static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_BE
| OPTS_TYPE_PT_UTF16BE
| OPTS_TYPE_MP_MULTI_DISABLE
| OPTS_TYPE_NATIVE_THREADS
| OPTS_TYPE_LOOP_PREPARE
| OPTS_TYPE_SELF_TEST_DISABLE;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
static const char *ST_PASS = "hashcat";
@ -64,14 +65,14 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 1;
const u32 kernel_loops_min = 1024;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1;
const u32 kernel_loops_max = 1024;
return kernel_loops_max;
}
@ -320,6 +321,9 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
salt->scrypt_r = SCRYPT_R;
salt->scrypt_p = SCRYPT_P;
salt->salt_iter = salt->scrypt_N;
salt->salt_repeats = salt->scrypt_p - 1;
// version
const u8 *version_pos = token.buf[1];
@ -353,8 +357,7 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
salt->salt_buf[10] = hex_to_u32 (b2_pos + 16);
salt->salt_buf[11] = hex_to_u32 (b2_pos + 24);
salt->salt_len = 48;
salt->salt_iter = 1;
salt->salt_len = 48;
// fake digest: