From 0a5575cde5de93f988208fca238bca3059cffc46 Mon Sep 17 00:00:00 2001
From: tweqx <romla@sfr.fr>
Date: Sat, 21 May 2022 15:07:51 +0200
Subject: [PATCH 1/5] Add BLAKE2b-512($salt.$pass) and BLAKE2b-512($pass.$salt)

---
 OpenCL/inc_hash_blake2b.cl    |  30 ++
 OpenCL/inc_hash_blake2b.h     |   1 +
 OpenCL/m00610_a0-optimized.cl | 360 +++++++++++++++++++++
 OpenCL/m00610_a0-pure.cl      | 133 ++++++++
 OpenCL/m00610_a1-optimized.cl | 478 +++++++++++++++++++++++++++
 OpenCL/m00610_a1-pure.cl      | 131 ++++++++
 OpenCL/m00610_a3-optimized.cl | 534 ++++++++++++++++++++++++++++++
 OpenCL/m00610_a3-pure.cl      | 158 +++++++++
 OpenCL/m00620_a0-optimized.cl | 316 ++++++++++++++++++
 OpenCL/m00620_a0-pure.cl      | 121 +++++++
 OpenCL/m00620_a1-optimized.cl | 434 +++++++++++++++++++++++++
 OpenCL/m00620_a1-pure.cl      | 114 +++++++
 OpenCL/m00620_a3-optimized.cl | 590 ++++++++++++++++++++++++++++++++++
 OpenCL/m00620_a3-pure.cl      | 148 +++++++++
 docs/changes.txt              |   2 +
 docs/readme.txt               |   2 +
 src/modules/module_00610.c    | 221 +++++++++++++
 src/modules/module_00620.c    | 221 +++++++++++++
 tools/test_modules/m00610.pm  |  44 +++
 tools/test_modules/m00620.pm  |  44 +++
 20 files changed, 4082 insertions(+)
 create mode 100644 OpenCL/m00610_a0-optimized.cl
 create mode 100644 OpenCL/m00610_a0-pure.cl
 create mode 100644 OpenCL/m00610_a1-optimized.cl
 create mode 100644 OpenCL/m00610_a1-pure.cl
 create mode 100644 OpenCL/m00610_a3-optimized.cl
 create mode 100644 OpenCL/m00610_a3-pure.cl
 create mode 100644 OpenCL/m00620_a0-optimized.cl
 create mode 100644 OpenCL/m00620_a0-pure.cl
 create mode 100644 OpenCL/m00620_a1-optimized.cl
 create mode 100644 OpenCL/m00620_a1-pure.cl
 create mode 100644 OpenCL/m00620_a3-optimized.cl
 create mode 100644 OpenCL/m00620_a3-pure.cl
 create mode 100644 src/modules/module_00610.c
 create mode 100644 src/modules/module_00620.c
 create mode 100644 tools/test_modules/m00610.pm
 create mode 100644 tools/test_modules/m00620.pm

diff --git a/OpenCL/inc_hash_blake2b.cl b/OpenCL/inc_hash_blake2b.cl
index 9df986d1c..6b3bbe950 100644
--- a/OpenCL/inc_hash_blake2b.cl
+++ b/OpenCL/inc_hash_blake2b.cl
@@ -660,6 +660,36 @@ DECLSPEC void blake2b_init_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx)
   ctx->len = 0;
 }
 
+/**
+ * Initialize a vector BLAKE2b context from a scalar one.
+ *
+ * Copies h[0..7], m[0..15] and len from ctx0 into ctx by plain assignment
+ * of each scalar value to the matching field of the vector context,
+ * presumably so that input already absorbed by ctx0 need not be rehashed.
+ *
+ * ctx  - destination vector context; h, m and len are overwritten
+ * ctx0 - source scalar context; only read
+ *
+ * Both pointers carry PRIVATE_AS so that this definition agrees with the
+ * prototype declared in inc_hash_blake2b.h; the qualifier matters on any
+ * backend where PRIVATE_AS expands to a real address-space qualifier.
+ */
+
+DECLSPEC void blake2b_init_vector_from_scalar (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS blake2b_ctx_t *ctx0)
+{
+  for (int i = 0; i < 8; i++)
+  {
+    ctx->h[i] = ctx0->h[i];
+  }
+
+  for (int i = 0; i < 16; i++)
+  {
+    ctx->m[i] = ctx0->m[i];
+  }
+
+  ctx->len = ctx0->len;
+}
+
 DECLSPEC void blake2b_update_vector_128 (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS u32x *w0, PRIVATE_AS u32x *w1, PRIVATE_AS u32x *w2, PRIVATE_AS u32x *w3, PRIVATE_AS u32x *w4, PRIVATE_AS u32x *w5, PRIVATE_AS u32x *w6, PRIVATE_AS u32x *w7, const int len)
 {
   if (len == 0) return;
diff --git a/OpenCL/inc_hash_blake2b.h b/OpenCL/inc_hash_blake2b.h
index b7d6e134c..d4cfb96d4 100644
--- a/OpenCL/inc_hash_blake2b.h
+++ b/OpenCL/inc_hash_blake2b.h
@@ -92,6 +92,7 @@ DECLSPEC void blake2b_final (PRIVATE_AS blake2b_ctx_t *ctx);
 
 DECLSPEC void blake2b_transform_vector (PRIVATE_AS u64x *h, PRIVATE_AS const u64x *m, const u32x len, const u64 f0);
 DECLSPEC void blake2b_init_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx);
+DECLSPEC void blake2b_init_vector_from_scalar(PRIVATE_AS blake2b_ctx_vector_t* ctx, PRIVATE_AS blake2b_ctx_t* ctx0);
 DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len);
 DECLSPEC void blake2b_final_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx);
 
diff --git a/OpenCL/m00610_a0-optimized.cl b/OpenCL/m00610_a0-optimized.cl
new file mode 100644
index 000000000..01b29a159
--- /dev/null
+++ b/OpenCL/m00610_a0-optimized.cl
@@ -0,0 +1,360 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_rp_optimized.h)
+#include M2S(INCLUDE_PATH/inc_rp_optimized.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00610_m04 (KERN_ATTR_RULES ())
+{
+  /**
+   * base: BLAKE2b over (rule-mangled word . salt); results go to COMPARE_M_SIMD
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to 0..63
+
+  /**
+   * salt: first 16 u32 words of the salt buffer plus its length
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * loop: each iteration advances il_pos by VECT_SIZE rule slots
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };
+
+    const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);
+
+    /**
+     * append salt: offset a fresh salt copy by out_len, then OR it into w0..w3
+     */
+
+    u32x s0[4];
+    u32x s1[4];
+    u32x s2[4];
+    u32x s3[4];
+
+    s0[0] = salt_buf0[0];
+    s0[1] = salt_buf0[1];
+    s0[2] = salt_buf0[2];
+    s0[3] = salt_buf0[3];
+    s1[0] = salt_buf1[0];
+    s1[1] = salt_buf1[1];
+    s1[2] = salt_buf1[2];
+    s1[3] = salt_buf1[3];
+    s2[0] = salt_buf2[0];
+    s2[1] = salt_buf2[1];
+    s2[2] = salt_buf2[2];
+    s2[3] = salt_buf2[3];
+    s3[0] = salt_buf3[0];
+    s3[1] = salt_buf3[1];
+    s3[2] = salt_buf3[2];
+    s3[3] = salt_buf3[3];
+
+    switch_buffer_by_offset_le_VV (s0, s1, s2, s3, out_len);
+
+    const u32x pw_salt_len = out_len + salt_len; // total message length passed to the transform
+
+    w0[0] |= s0[0];
+    w0[1] |= s0[1];
+    w0[2] |= s0[2];
+    w0[3] |= s0[3];
+    w1[0] |= s1[0];
+    w1[1] |= s1[1];
+    w1[2] |= s1[2];
+    w1[3] |= s1[3];
+    w2[0] |= s2[0];
+    w2[1] |= s2[1];
+    w2[2] |= s2[2];
+    w2[3] |= s2[3];
+    w3[0] |= s3[0];
+    w3[1] |= s3[1];
+    w3[2] |= s3[2];
+    w3[3] |= s3[3];
+
+    /**
+     * blake2b: pack w0..w3 pairwise into m[0..7], zero m[8..15], one final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: 0x01010000 ^ (keylen 0 << 8) ^ outlen 64
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // only h[0] and h[1] are compared
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_m08 (KERN_ATTR_RULES ())
+{ // empty body: this entry point performs no work in this file
+}
+
+KERNEL_FQ void m00610_m16 (KERN_ATTR_RULES ())
+{ // empty body: this entry point performs no work in this file
+}
+
+KERNEL_FQ void m00610_s04 (KERN_ATTR_RULES ())
+{
+  /**
+   * base: BLAKE2b over (rule-mangled word . salt); results go to COMPARE_S_SIMD
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  /**
+   * base word length (masked to 0..63), then salt: first 16 u32 words plus length
+   */
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * digest: four target words; presumably read by COMPARE_S_SIMD via the name `search`
+   */
+
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * loop: each iteration advances il_pos by VECT_SIZE rule slots
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };
+
+    const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);
+
+    /**
+     * append salt: offset a fresh salt copy by out_len, then OR it into w0..w3
+     */
+
+    u32x s0[4];
+    u32x s1[4];
+    u32x s2[4];
+    u32x s3[4];
+
+    s0[0] = salt_buf0[0];
+    s0[1] = salt_buf0[1];
+    s0[2] = salt_buf0[2];
+    s0[3] = salt_buf0[3];
+    s1[0] = salt_buf1[0];
+    s1[1] = salt_buf1[1];
+    s1[2] = salt_buf1[2];
+    s1[3] = salt_buf1[3];
+    s2[0] = salt_buf2[0];
+    s2[1] = salt_buf2[1];
+    s2[2] = salt_buf2[2];
+    s2[3] = salt_buf2[3];
+    s3[0] = salt_buf3[0];
+    s3[1] = salt_buf3[1];
+    s3[2] = salt_buf3[2];
+    s3[3] = salt_buf3[3];
+
+    switch_buffer_by_offset_le_VV (s0, s1, s2, s3, out_len);
+
+    const u32x pw_salt_len = out_len + salt_len; // total message length passed to the transform
+
+    w0[0] |= s0[0];
+    w0[1] |= s0[1];
+    w0[2] |= s0[2];
+    w0[3] |= s0[3];
+    w1[0] |= s1[0];
+    w1[1] |= s1[1];
+    w1[2] |= s1[2];
+    w1[3] |= s1[3];
+    w2[0] |= s2[0];
+    w2[1] |= s2[1];
+    w2[2] |= s2[2];
+    w2[3] |= s2[3];
+    w3[0] |= s3[0];
+    w3[1] |= s3[1];
+    w3[2] |= s3[2];
+    w3[3] |= s3[3];
+
+    /**
+     * blake2b: pack w0..w3 pairwise into m[0..7], zero m[8..15], one final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: 0x01010000 ^ (keylen 0 << 8) ^ outlen 64
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // only h[0] and h[1] are compared
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_s08 (KERN_ATTR_RULES ())
+{ // empty body: this entry point performs no work in this file
+}
+
+KERNEL_FQ void m00610_s16 (KERN_ATTR_RULES ())
+{ // empty body: this entry point performs no work in this file
+}
+
diff --git a/OpenCL/m00610_a0-pure.cl b/OpenCL/m00610_a0-pure.cl
new file mode 100644
index 000000000..f3d98ff9e
--- /dev/null
+++ b/OpenCL/m00610_a0-pure.cl
@@ -0,0 +1,133 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+//#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_rp.h)
+#include M2S(INCLUDE_PATH/inc_rp.cl)
+#include M2S(INCLUDE_PATH/inc_scalar.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00610_mxx (KERN_ATTR_RULES ())
+{
+  /**
+   * base: BLAKE2b over (rule-mangled word . salt); results go to COMPARE_M_SCALAR
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * base word (COPY_PW) and a zero-padded local copy of the salt
+   */
+
+  COPY_PW (pws[gid]);
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  u32 s[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1) // copies ceil (salt_len / 4) words
+  {
+    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
+  }
+
+  /**
+   * loop: one rule per iteration, fresh BLAKE2b context each time
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    pw_t tmp = PASTE_PW;
+
+    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
+
+    blake2b_ctx_t ctx;
+    blake2b_init (&ctx);
+
+    blake2b_update (&ctx, tmp.i, tmp.pw_len); // word first ...
+    blake2b_update (&ctx, s, salt_len);       // ... then salt
+
+    blake2b_final  (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // only h[0] and h[1] are compared
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_M_SCALAR (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_sxx (KERN_ATTR_RULES ())
+{
+  /**
+   * base: BLAKE2b over (rule-mangled word . salt); results go to COMPARE_S_SCALAR
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * digest: four target words; presumably read by COMPARE_S_SCALAR via the name `search`
+   */
+
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * base word (COPY_PW) and a zero-padded local copy of the salt
+   */
+
+  COPY_PW (pws[gid]);
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  u32 s[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1) // copies ceil (salt_len / 4) words
+  {
+    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
+  }
+
+  /**
+   * loop: one rule per iteration, fresh BLAKE2b context each time
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    pw_t tmp = PASTE_PW;
+
+    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
+
+    blake2b_ctx_t ctx;
+    blake2b_init (&ctx);
+
+    blake2b_update (&ctx, tmp.i, tmp.pw_len); // word first ...
+    blake2b_update (&ctx, s, salt_len);       // ... then salt
+
+    blake2b_final  (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // only h[0] and h[1] are compared
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_S_SCALAR (r0, r1, r2, r3);
+  }
+}
diff --git a/OpenCL/m00610_a1-optimized.cl b/OpenCL/m00610_a1-optimized.cl
new file mode 100644
index 000000000..e50f6955c
--- /dev/null
+++ b/OpenCL/m00610_a1-optimized.cl
@@ -0,0 +1,478 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00610_m04 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b over (left word + right word . salt); results go to COMPARE_M_SIMD
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_l_len = pws[gid].pw_len & 63; // length masked to 0..63
+
+  /**
+   * salt: first 16 u32 words of the salt buffer plus its length
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * loop: each iteration advances il_pos by VECT_SIZE entries of combs_buf
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
+
+    const u32x pw_len = (pw_l_len + pw_r_len) & 63; // combined length, again masked to 0..63
+
+    /**
+     * concat password candidate: COMBS_MODE selects which side gets offset by the other's length
+     */
+
+    u32x wordl0[4] = { 0 };
+    u32x wordl1[4] = { 0 };
+    u32x wordl2[4] = { 0 };
+    u32x wordl3[4] = { 0 };
+
+    wordl0[0] = pw_buf0[0];
+    wordl0[1] = pw_buf0[1];
+    wordl0[2] = pw_buf0[2];
+    wordl0[3] = pw_buf0[3];
+    wordl1[0] = pw_buf1[0];
+    wordl1[1] = pw_buf1[1];
+    wordl1[2] = pw_buf1[2];
+    wordl1[3] = pw_buf1[3];
+
+    u32x wordr0[4] = { 0 };
+    u32x wordr1[4] = { 0 };
+    u32x wordr2[4] = { 0 };
+    u32x wordr3[4] = { 0 };
+
+    wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
+    wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
+    wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
+    wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
+    wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
+    wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
+    wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
+    wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
+
+    if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT)
+    {
+      switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+    }
+    else
+    {
+      switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
+    }
+
+    u32x w0[4];
+    u32x w1[4];
+    u32x w2[4];
+    u32x w3[4];
+
+    w0[0] = wordl0[0] | wordr0[0];
+    w0[1] = wordl0[1] | wordr0[1];
+    w0[2] = wordl0[2] | wordr0[2];
+    w0[3] = wordl0[3] | wordr0[3];
+    w1[0] = wordl1[0] | wordr1[0];
+    w1[1] = wordl1[1] | wordr1[1];
+    w1[2] = wordl1[2] | wordr1[2];
+    w1[3] = wordl1[3] | wordr1[3];
+    w2[0] = wordl2[0] | wordr2[0];
+    w2[1] = wordl2[1] | wordr2[1];
+    w2[2] = wordl2[2] | wordr2[2];
+    w2[3] = wordl2[3] | wordr2[3];
+    w3[0] = wordl3[0] | wordr3[0];
+    w3[1] = wordl3[1] | wordr3[1];
+    w3[2] = wordl3[2] | wordr3[2];
+    w3[3] = wordl3[3] | wordr3[3];
+
+    /**
+     * append salt: offset a fresh salt copy by pw_len, then OR it into w0..w3
+     */
+
+    u32x s0[4];
+    u32x s1[4];
+    u32x s2[4];
+    u32x s3[4];
+
+    s0[0] = salt_buf0[0];
+    s0[1] = salt_buf0[1];
+    s0[2] = salt_buf0[2];
+    s0[3] = salt_buf0[3];
+    s1[0] = salt_buf1[0];
+    s1[1] = salt_buf1[1];
+    s1[2] = salt_buf1[2];
+    s1[3] = salt_buf1[3];
+    s2[0] = salt_buf2[0];
+    s2[1] = salt_buf2[1];
+    s2[2] = salt_buf2[2];
+    s2[3] = salt_buf2[3];
+    s3[0] = salt_buf3[0];
+    s3[1] = salt_buf3[1];
+    s3[2] = salt_buf3[2];
+    s3[3] = salt_buf3[3];
+
+    switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len);
+
+    const u32x pw_salt_len = pw_len + salt_len; // total message length passed to the transform
+
+    w0[0] |= s0[0];
+    w0[1] |= s0[1];
+    w0[2] |= s0[2];
+    w0[3] |= s0[3];
+    w1[0] |= s1[0];
+    w1[1] |= s1[1];
+    w1[2] |= s1[2];
+    w1[3] |= s1[3];
+    w2[0] |= s2[0];
+    w2[1] |= s2[1];
+    w2[2] |= s2[2];
+    w2[3] |= s2[3];
+    w3[0] |= s3[0];
+    w3[1] |= s3[1];
+    w3[2] |= s3[2];
+    w3[3] |= s3[3];
+
+    /**
+     * blake2b: pack w0..w3 pairwise into m[0..7], zero m[8..15], one final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: 0x01010000 ^ (keylen 0 << 8) ^ outlen 64
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // only h[0] and h[1] are compared
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_m08 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work in this file
+}
+
+KERNEL_FQ void m00610_m16 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work in this file
+}
+
+KERNEL_FQ void m00610_s04 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b over (left word + right word . salt); results go to COMPARE_S_SIMD
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_l_len = pws[gid].pw_len & 63; // length masked to 0..63
+
+  /**
+   * salt: first 16 u32 words of the salt buffer plus its length
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * digest: four target words; presumably read by COMPARE_S_SIMD via the name `search`
+   */
+
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * loop: each iteration advances il_pos by VECT_SIZE entries of combs_buf
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
+
+    const u32x pw_len = (pw_l_len + pw_r_len) & 63; // combined length, again masked to 0..63
+
+    /**
+     * concat password candidate: COMBS_MODE selects which side gets offset by the other's length
+     */
+
+    u32x wordl0[4] = { 0 };
+    u32x wordl1[4] = { 0 };
+    u32x wordl2[4] = { 0 };
+    u32x wordl3[4] = { 0 };
+
+    wordl0[0] = pw_buf0[0];
+    wordl0[1] = pw_buf0[1];
+    wordl0[2] = pw_buf0[2];
+    wordl0[3] = pw_buf0[3];
+    wordl1[0] = pw_buf1[0];
+    wordl1[1] = pw_buf1[1];
+    wordl1[2] = pw_buf1[2];
+    wordl1[3] = pw_buf1[3];
+
+    u32x wordr0[4] = { 0 };
+    u32x wordr1[4] = { 0 };
+    u32x wordr2[4] = { 0 };
+    u32x wordr3[4] = { 0 };
+
+    wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
+    wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
+    wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
+    wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
+    wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
+    wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
+    wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
+    wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
+
+    if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT)
+    {
+      switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
+    }
+    else
+    {
+      switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
+    }
+
+    u32x w0[4];
+    u32x w1[4];
+    u32x w2[4];
+    u32x w3[4];
+
+    w0[0] = wordl0[0] | wordr0[0];
+    w0[1] = wordl0[1] | wordr0[1];
+    w0[2] = wordl0[2] | wordr0[2];
+    w0[3] = wordl0[3] | wordr0[3];
+    w1[0] = wordl1[0] | wordr1[0];
+    w1[1] = wordl1[1] | wordr1[1];
+    w1[2] = wordl1[2] | wordr1[2];
+    w1[3] = wordl1[3] | wordr1[3];
+    w2[0] = wordl2[0] | wordr2[0];
+    w2[1] = wordl2[1] | wordr2[1];
+    w2[2] = wordl2[2] | wordr2[2];
+    w2[3] = wordl2[3] | wordr2[3];
+    w3[0] = wordl3[0] | wordr3[0];
+    w3[1] = wordl3[1] | wordr3[1];
+    w3[2] = wordl3[2] | wordr3[2];
+    w3[3] = wordl3[3] | wordr3[3];
+
+    /**
+     * append salt: offset a fresh salt copy by pw_len, then OR it into w0..w3
+     */
+
+    u32x s0[4];
+    u32x s1[4];
+    u32x s2[4];
+    u32x s3[4];
+
+    s0[0] = salt_buf0[0];
+    s0[1] = salt_buf0[1];
+    s0[2] = salt_buf0[2];
+    s0[3] = salt_buf0[3];
+    s1[0] = salt_buf1[0];
+    s1[1] = salt_buf1[1];
+    s1[2] = salt_buf1[2];
+    s1[3] = salt_buf1[3];
+    s2[0] = salt_buf2[0];
+    s2[1] = salt_buf2[1];
+    s2[2] = salt_buf2[2];
+    s2[3] = salt_buf2[3];
+    s3[0] = salt_buf3[0];
+    s3[1] = salt_buf3[1];
+    s3[2] = salt_buf3[2];
+    s3[3] = salt_buf3[3];
+
+    switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len);
+
+    const u32x pw_salt_len = pw_len + salt_len; // total message length passed to the transform
+
+    w0[0] |= s0[0];
+    w0[1] |= s0[1];
+    w0[2] |= s0[2];
+    w0[3] |= s0[3];
+    w1[0] |= s1[0];
+    w1[1] |= s1[1];
+    w1[2] |= s1[2];
+    w1[3] |= s1[3];
+    w2[0] |= s2[0];
+    w2[1] |= s2[1];
+    w2[2] |= s2[2];
+    w2[3] |= s2[3];
+    w3[0] |= s3[0];
+    w3[1] |= s3[1];
+    w3[2] |= s3[2]; // every s word lands in the w word with the same index
+    w3[3] |= s3[3];
+
+    /**
+     * blake2b: pack w0..w3 pairwise into m[0..7], zero m[8..15], one final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: 0x01010000 ^ (keylen 0 << 8) ^ outlen 64
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // only h[0] and h[1] are compared
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_s08 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work in this file
+}
+
+KERNEL_FQ void m00610_s16 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work in this file
+}
+
diff --git a/OpenCL/m00610_a1-pure.cl b/OpenCL/m00610_a1-pure.cl
new file mode 100644
index 000000000..9eeabb060
--- /dev/null
+++ b/OpenCL/m00610_a1-pure.cl
@@ -0,0 +1,131 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+//#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_scalar.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00610_mxx (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b over (pws word . combs word . salt); results go to COMPARE_M_SCALAR
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * zero-padded local salt copy, and ctx0 holding the pws word hashed once up front
+   */
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  u32 s[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1) // copies ceil (salt_len / 4) words
+  {
+    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
+  }
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);
+
+  /**
+   * loop: one combs_buf entry per iteration, each starting from a copy of ctx0
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    blake2b_ctx_t ctx = ctx0;
+
+    blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
+
+    blake2b_update (&ctx, s, salt_len);
+
+    blake2b_final (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // only h[0] and h[1] are compared
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_M_SCALAR (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_sxx (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b over (pws word . combs word . salt); results go to COMPARE_S_SCALAR
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * digest: four target words; presumably read by COMPARE_S_SCALAR via the name `search`
+   */
+
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+  /**
+   * zero-padded local salt copy, and ctx0 holding the pws word hashed once up front
+   */
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  u32 s[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1) // copies ceil (salt_len / 4) words
+  {
+    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
+  }
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);
+
+  /**
+   * loop: one combs_buf entry per iteration, each starting from a copy of ctx0
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    blake2b_ctx_t ctx = ctx0;
+
+    blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
+
+    blake2b_update (&ctx, s, salt_len);
+
+    blake2b_final (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // only h[0] and h[1] are compared
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_S_SCALAR (r0, r1, r2, r3);
+  }
+}
+
diff --git a/OpenCL/m00610_a3-optimized.cl b/OpenCL/m00610_a3-optimized.cl
new file mode 100644
index 000000000..7a406b40e
--- /dev/null
+++ b/OpenCL/m00610_a3-optimized.cl
@@ -0,0 +1,534 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+DECLSPEC void m00610m (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTOR ())
+{
+  /**
+   * worker called by the m00610_m04, m00610_m08 and m00610_m16 kernels:
+   * builds one message from the 16 words in w with the salt merged in behind them,
+   * runs a single blake2b_transform_vector per words_buf_r entry;
+   * modifiers are taken from args
+   */
+  // salt: 16 words loaded as four groups of four
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  const u32 pw_salt_len = pw_len + salt_len; // message length passed to the transform
+
+  switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); // NOTE(review): presumably moves the salt to byte offset pw_len - confirm
+
+  /**
+   * loop: each iteration ORs one words_buf_r vector into the first password word
+   */
+
+  u32 w0l = w[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+    const u32x w0x = w0l | w0r;
+
+    u32x w0[4];
+    u32x w1[4];
+    u32x w2[4];
+    u32x w3[4];
+
+    w0[0] = w0x;
+    w0[1] = w[ 1];
+    w0[2] = w[ 2];
+    w0[3] = w[ 3];
+    w1[0] = w[ 4];
+    w1[1] = w[ 5];
+    w1[2] = w[ 6];
+    w1[3] = w[ 7];
+    w2[0] = w[ 8];
+    w2[1] = w[ 9];
+    w2[2] = w[10];
+    w2[3] = w[11];
+    w3[0] = w[12];
+    w3[1] = w[13];
+    w3[2] = w[14];
+    w3[3] = w[15];
+
+    w0[0] |= salt_buf0[0]; // merge the salt words prepared above into the message words
+    w0[1] |= salt_buf0[1];
+    w0[2] |= salt_buf0[2];
+    w0[3] |= salt_buf0[3];
+    w1[0] |= salt_buf1[0];
+    w1[1] |= salt_buf1[1];
+    w1[2] |= salt_buf1[2];
+    w1[3] |= salt_buf1[3];
+    w2[0] |= salt_buf2[0];
+    w2[1] |= salt_buf2[1];
+    w2[2] |= salt_buf2[2];
+    w2[3] |= salt_buf2[3];
+    w3[0] |= salt_buf3[0];
+    w3[1] |= salt_buf3[1];
+    w3[2] |= salt_buf3[2];
+    w3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: the sixteen 32-bit words fill m[0..7] in pairs, m[8..15] stay zero
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: digest length 0x40, no key, fanout 1, depth 1
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL); // the only transform call per candidate, flagged final
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+DECLSPEC void m00610s (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTOR ())
+{
+  /**
+   * worker called by the m00610_s04, m00610_s08 and m00610_s16 kernels; same message
+   * construction as m00610m but ends in COMPARE_S_SIMD; modifiers are taken from args
+   */
+  /**
+   * digest: words DGST_R0..DGST_R3 of the digest at DIGESTS_OFFSET_HOST; search is not
+   * named again below, presumably COMPARE_S_SIMD reads it - confirm in the macro
+   */
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * salt: 16 words loaded as four groups of four
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  const u32 pw_salt_len = pw_len + salt_len; // message length passed to the transform
+
+  switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); // NOTE(review): presumably moves the salt to byte offset pw_len - confirm
+
+  /**
+   * loop: each iteration ORs one words_buf_r vector into the first password word
+   */
+
+  u32 w0l = w[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+    const u32x w0x = w0l | w0r;
+
+    u32x w0[4];
+    u32x w1[4];
+    u32x w2[4];
+    u32x w3[4];
+
+    w0[0] = w0x;
+    w0[1] = w[ 1];
+    w0[2] = w[ 2];
+    w0[3] = w[ 3];
+    w1[0] = w[ 4];
+    w1[1] = w[ 5];
+    w1[2] = w[ 6];
+    w1[3] = w[ 7];
+    w2[0] = w[ 8];
+    w2[1] = w[ 9];
+    w2[2] = w[10];
+    w2[3] = w[11];
+    w3[0] = w[12];
+    w3[1] = w[13];
+    w3[2] = w[14];
+    w3[3] = w[15];
+
+    w0[0] |= salt_buf0[0]; // merge the salt words prepared above into the message words
+    w0[1] |= salt_buf0[1];
+    w0[2] |= salt_buf0[2];
+    w0[3] |= salt_buf0[3];
+    w1[0] |= salt_buf1[0];
+    w1[1] |= salt_buf1[1];
+    w1[2] |= salt_buf1[2];
+    w1[3] |= salt_buf1[3];
+    w2[0] |= salt_buf2[0];
+    w2[1] |= salt_buf2[1];
+    w2[2] |= salt_buf2[2];
+    w2[3] |= salt_buf2[3];
+    w3[0] |= salt_buf3[0];
+    w3[1] |= salt_buf3[1];
+    w3[2] |= salt_buf3[2];
+    w3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: the sixteen 32-bit words fill m[0..7] in pairs, m[8..15] stay zero
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: digest length 0x40, no key, fanout 1, depth 1
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL); // the only transform call per candidate, flagged final
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_m04 (KERN_ATTR_VECTOR ())
+{
+  /**
+   * base: entry point that copies the first 4 words of pws[gid] into a
+   * zero-padded 16-word buffer and passes it to m00610m
+   */
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 w[16];
+
+  w[ 0] = pws[gid].i[ 0];
+  w[ 1] = pws[gid].i[ 1];
+  w[ 2] = pws[gid].i[ 2];
+  w[ 3] = pws[gid].i[ 3];
+  w[ 4] = 0;
+  w[ 5] = 0;
+  w[ 6] = 0;
+  w[ 7] = 0;
+  w[ 8] = 0;
+  w[ 9] = 0;
+  w[10] = 0;
+  w[11] = 0;
+  w[12] = 0;
+  w[13] = 0;
+  w[14] = 0;
+  w[15] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to the range 0..63
+
+  /**
+   * main: the kernel's buffers plus gid, lid and lsz are forwarded to the worker
+   */
+
+  m00610m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00610_m08 (KERN_ATTR_VECTOR ())
+{
+  /**
+   * base: entry point that copies the first 8 words of pws[gid] into a
+   * zero-padded 16-word buffer and passes it to m00610m
+   */
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 w[16];
+
+  w[ 0] = pws[gid].i[ 0];
+  w[ 1] = pws[gid].i[ 1];
+  w[ 2] = pws[gid].i[ 2];
+  w[ 3] = pws[gid].i[ 3];
+  w[ 4] = pws[gid].i[ 4];
+  w[ 5] = pws[gid].i[ 5];
+  w[ 6] = pws[gid].i[ 6];
+  w[ 7] = pws[gid].i[ 7];
+  w[ 8] = 0;
+  w[ 9] = 0;
+  w[10] = 0;
+  w[11] = 0;
+  w[12] = 0;
+  w[13] = 0;
+  w[14] = 0;
+  w[15] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to the range 0..63
+
+  /**
+   * main: the kernel's buffers plus gid, lid and lsz are forwarded to the worker
+   */
+
+  m00610m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00610_m16 (KERN_ATTR_VECTOR ())
+{
+  /**
+   * base: entry point that copies 16 words of pws[gid] into a private
+   * buffer and passes it to m00610m
+   */
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 w[16];
+
+  w[ 0] = pws[gid].i[ 0];
+  w[ 1] = pws[gid].i[ 1];
+  w[ 2] = pws[gid].i[ 2];
+  w[ 3] = pws[gid].i[ 3];
+  w[ 4] = pws[gid].i[ 4];
+  w[ 5] = pws[gid].i[ 5];
+  w[ 6] = pws[gid].i[ 6];
+  w[ 7] = pws[gid].i[ 7];
+  w[ 8] = pws[gid].i[ 8];
+  w[ 9] = pws[gid].i[ 9];
+  w[10] = pws[gid].i[10];
+  w[11] = pws[gid].i[11];
+  w[12] = pws[gid].i[12];
+  w[13] = pws[gid].i[13];
+  w[14] = pws[gid].i[14];
+  w[15] = pws[gid].i[15];
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to the range 0..63
+
+  /**
+   * main: the kernel's buffers plus gid, lid and lsz are forwarded to the worker
+   */
+
+  m00610m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00610_s04 (KERN_ATTR_VECTOR ())
+{
+  /**
+   * base: entry point that copies the first 4 words of pws[gid] into a
+   * zero-padded 16-word buffer and passes it to m00610s
+   */
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 w[16];
+
+  w[ 0] = pws[gid].i[ 0];
+  w[ 1] = pws[gid].i[ 1];
+  w[ 2] = pws[gid].i[ 2];
+  w[ 3] = pws[gid].i[ 3];
+  w[ 4] = 0;
+  w[ 5] = 0;
+  w[ 6] = 0;
+  w[ 7] = 0;
+  w[ 8] = 0;
+  w[ 9] = 0;
+  w[10] = 0;
+  w[11] = 0;
+  w[12] = 0;
+  w[13] = 0;
+  w[14] = 0;
+  w[15] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to the range 0..63
+
+  /**
+   * main: the kernel's buffers plus gid, lid and lsz are forwarded to the worker
+   */
+
+  m00610s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00610_s08 (KERN_ATTR_VECTOR ())
+{
+  /**
+   * base: entry point that copies the first 8 words of pws[gid] into a
+   * zero-padded 16-word buffer and passes it to m00610s
+   */
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 w[16];
+
+  w[ 0] = pws[gid].i[ 0];
+  w[ 1] = pws[gid].i[ 1];
+  w[ 2] = pws[gid].i[ 2];
+  w[ 3] = pws[gid].i[ 3];
+  w[ 4] = pws[gid].i[ 4];
+  w[ 5] = pws[gid].i[ 5];
+  w[ 6] = pws[gid].i[ 6];
+  w[ 7] = pws[gid].i[ 7];
+  w[ 8] = 0;
+  w[ 9] = 0;
+  w[10] = 0;
+  w[11] = 0;
+  w[12] = 0;
+  w[13] = 0;
+  w[14] = 0;
+  w[15] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to the range 0..63
+
+  /**
+   * main: the kernel's buffers plus gid, lid and lsz are forwarded to the worker
+   */
+
+  m00610s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00610_s16 (KERN_ATTR_VECTOR ())
+{
+  /**
+   * base: entry point that copies 16 words of pws[gid] into a private
+   * buffer and passes it to m00610s
+   */
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 w[16];
+
+  w[ 0] = pws[gid].i[ 0];
+  w[ 1] = pws[gid].i[ 1];
+  w[ 2] = pws[gid].i[ 2];
+  w[ 3] = pws[gid].i[ 3];
+  w[ 4] = pws[gid].i[ 4];
+  w[ 5] = pws[gid].i[ 5];
+  w[ 6] = pws[gid].i[ 6];
+  w[ 7] = pws[gid].i[ 7];
+  w[ 8] = pws[gid].i[ 8];
+  w[ 9] = pws[gid].i[ 9];
+  w[10] = pws[gid].i[10];
+  w[11] = pws[gid].i[11];
+  w[12] = pws[gid].i[12];
+  w[13] = pws[gid].i[13];
+  w[14] = pws[gid].i[14];
+  w[15] = pws[gid].i[15];
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to the range 0..63
+
+  /**
+   * main: the kernel's buffers plus gid, lid and lsz are forwarded to the worker
+   */
+
+  m00610s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
diff --git a/OpenCL/m00610_a3-pure.cl b/OpenCL/m00610_a3-pure.cl
new file mode 100644
index 000000000..d61b6f847
--- /dev/null
+++ b/OpenCL/m00610_a3-pure.cl
@@ -0,0 +1,158 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00610_mxx (KERN_ATTR_VECTOR ())
+{
+  /**
+   * modifier: hashes BLAKE2b (password . salt) for every words_buf_r entry,
+   * where the entry is ORed into the first password word
+   */
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  /**
+   * base: password and salt are copied into zeroed buffers, one word per 4 bytes of length
+   */
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  u32x w[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
+  {
+    w[idx] = pws[gid].i[idx];
+  }
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  u32x s[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
+  {
+    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
+  }
+
+  /**
+   * loop: w0l keeps the original first word so every iteration ORs into a clean copy
+   */
+
+  u32x w0l = w[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+    const u32x w0 = w0l | w0r;
+
+    w[0] = w0;
+
+    blake2b_ctx_vector_t ctx; // a fresh context is initialised for every candidate
+
+    blake2b_init_vector   (&ctx);
+
+    blake2b_update_vector (&ctx, w, pw_len);
+
+    blake2b_update_vector (&ctx, s, salt_len); // the salt is absorbed after the password
+
+    blake2b_final_vector  (&ctx);
+
+    const u32x r0 = h32_from_64 (ctx.h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32x r1 = l32_from_64 (ctx.h[0]);
+    const u32x r2 = h32_from_64 (ctx.h[1]);
+    const u32x r3 = l32_from_64 (ctx.h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00610_sxx (KERN_ATTR_VECTOR ())
+{
+  /**
+   * modifier: hashes BLAKE2b (password . salt) for every words_buf_r entry,
+   * where the entry is ORed into the first password word
+   */
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  /**
+   * digest: words DGST_R0..DGST_R3 of the digest at DIGESTS_OFFSET_HOST; search is not
+   * named again below, presumably COMPARE_S_SIMD reads it - confirm in the macro
+   */
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * base: password and salt are copied into zeroed buffers, one word per 4 bytes of length
+   */
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  u32x w[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
+  {
+    w[idx] = pws[gid].i[idx];
+  }
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  u32x s[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
+  {
+    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
+  }
+
+  /**
+   * loop: w0l keeps the original first word so every iteration ORs into a clean copy
+   */
+
+  u32x w0l = w[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+    const u32x w0 = w0l | w0r;
+
+    w[0] = w0;
+
+    blake2b_ctx_vector_t ctx; // a fresh context is initialised for every candidate
+
+    blake2b_init_vector   (&ctx);
+
+    blake2b_update_vector (&ctx, w, pw_len);
+
+    blake2b_update_vector (&ctx, s, salt_len); // the salt is absorbed after the password
+
+    blake2b_final_vector  (&ctx);
+
+    const u32x r0 = h32_from_64 (ctx.h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32x r1 = l32_from_64 (ctx.h[0]);
+    const u32x r2 = h32_from_64 (ctx.h[1]);
+    const u32x r3 = l32_from_64 (ctx.h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
diff --git a/OpenCL/m00620_a0-optimized.cl b/OpenCL/m00620_a0-optimized.cl
new file mode 100644
index 000000000..29144dc4a
--- /dev/null
+++ b/OpenCL/m00620_a0-optimized.cl
@@ -0,0 +1,316 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_rp_optimized.h)
+#include M2S(INCLUDE_PATH/inc_rp_optimized.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00620_m04 (KERN_ATTR_RULES ())
+{
+  /**
+   * base: for every rule position, builds (salt . rule output of pws[gid]) as one
+   * message and runs a single blake2b_transform_vector over it
+   */
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked to the range 0..63
+
+  /**
+   * salt: 16 words loaded as four groups of four
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * loop: il_pos is passed to apply_rules_vect_optimized together with rules_buf
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };
+
+    const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); // result goes to w0/w1, its length to out_len
+
+    /**
+     * prepend salt
+     */
+
+    const u32x out_salt_len = out_len + salt_len; // message length passed to the transform
+
+    switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); // NOTE(review): presumably moves the candidate to byte offset salt_len - confirm
+
+    w0[0] |= salt_buf0[0]; // merge the salt words into the message words
+    w0[1] |= salt_buf0[1];
+    w0[2] |= salt_buf0[2];
+    w0[3] |= salt_buf0[3];
+    w1[0] |= salt_buf1[0];
+    w1[1] |= salt_buf1[1];
+    w1[2] |= salt_buf1[2];
+    w1[3] |= salt_buf1[3];
+    w2[0] |= salt_buf2[0];
+    w2[1] |= salt_buf2[1];
+    w2[2] |= salt_buf2[2];
+    w2[3] |= salt_buf2[3];
+    w3[0] |= salt_buf3[0];
+    w3[1] |= salt_buf3[1];
+    w3[2] |= salt_buf3[2];
+    w3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: the sixteen 32-bit words fill m[0..7] in pairs, m[8..15] stay zero
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: digest length 0x40, no key, fanout 1, depth 1
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, out_salt_len, BLAKE2B_FINAL); // the only transform call per candidate, flagged final
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_m08 (KERN_ATTR_RULES ())
+{ // empty body: this entry point does no work; NOTE(review): presumably only m00620_m04 is needed for rule candidates - confirm
+}
+
+KERNEL_FQ void m00620_m16 (KERN_ATTR_RULES ())
+{ // empty body: this entry point does no work; NOTE(review): presumably only m00620_m04 is needed for rule candidates - confirm
+}
+
+KERNEL_FQ void m00620_s04 (KERN_ATTR_RULES ())
+{
+  /**
+   * base: for every rule position, builds (salt . rule output of pws[gid]) as one
+   * message and runs a single blake2b_transform_vector over it
+   */
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  /**
+   * password length (masked to 0..63), then the salt: 16 words as four groups of four
+   */
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * digest: words DGST_R0..DGST_R3 of the digest at DIGESTS_OFFSET_HOST; search is not
+   * named again below, presumably COMPARE_S_SIMD reads it - confirm in the macro
+   */
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * loop: il_pos is passed to apply_rules_vect_optimized together with rules_buf
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };
+
+    const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); // result goes to w0/w1, its length to out_len
+
+    /**
+     * prepend salt
+     */
+
+    const u32x out_salt_len = out_len + salt_len; // message length passed to the transform
+
+    switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); // NOTE(review): presumably moves the candidate to byte offset salt_len - confirm
+
+    w0[0] |= salt_buf0[0]; // merge the salt words into the message words
+    w0[1] |= salt_buf0[1];
+    w0[2] |= salt_buf0[2];
+    w0[3] |= salt_buf0[3];
+    w1[0] |= salt_buf1[0];
+    w1[1] |= salt_buf1[1];
+    w1[2] |= salt_buf1[2];
+    w1[3] |= salt_buf1[3];
+    w2[0] |= salt_buf2[0];
+    w2[1] |= salt_buf2[1];
+    w2[2] |= salt_buf2[2];
+    w2[3] |= salt_buf2[3];
+    w3[0] |= salt_buf3[0];
+    w3[1] |= salt_buf3[1];
+    w3[2] |= salt_buf3[2];
+    w3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: the sixteen 32-bit words fill m[0..7] in pairs, m[8..15] stay zero
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: digest length 0x40, no key, fanout 1, depth 1
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, out_salt_len, BLAKE2B_FINAL); // the only transform call per candidate, flagged final
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_s08 (KERN_ATTR_RULES ())
+{ // empty body: this entry point does no work; NOTE(review): presumably only m00620_s04 is needed for rule candidates - confirm
+}
+
+KERNEL_FQ void m00620_s16 (KERN_ATTR_RULES ())
+{ // empty body: this entry point does no work; NOTE(review): presumably only m00620_s04 is needed for rule candidates - confirm
+}
+
diff --git a/OpenCL/m00620_a0-pure.cl b/OpenCL/m00620_a0-pure.cl
new file mode 100644
index 000000000..bcc056611
--- /dev/null
+++ b/OpenCL/m00620_a0-pure.cl
@@ -0,0 +1,121 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+//#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_rp.h)
+#include M2S(INCLUDE_PATH/inc_rp.cl)
+#include M2S(INCLUDE_PATH/inc_scalar.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00620_mxx (KERN_ATTR_RULES ())
+{
+  /**
+   * base: hashes BLAKE2b (salt . rule output of pws[gid]) for every rules_buf entry
+   * and hands the first two state words to COMPARE_M_SCALAR
+   */
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  /**
+   * base: COPY_PW/PASTE_PW are macros defined elsewhere; presumably they stage pws[gid] for the per-rule copy - confirm
+   */
+
+  COPY_PW (pws[gid]);
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len); // the salt is absorbed once, outside the loop
+
+  /**
+   * loop: one rules_buf entry per iteration, each starting from a copy of ctx0
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    pw_t tmp = PASTE_PW;
+
+    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len); // the rule's return value becomes the candidate length
+
+    blake2b_ctx_t ctx = ctx0;
+
+    blake2b_update (&ctx, tmp.i, tmp.pw_len); // the candidate is absorbed after the salt
+    blake2b_final  (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_M_SCALAR (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_sxx (KERN_ATTR_RULES ())
+{
+  /**
+   * base: hashes BLAKE2b (salt . rule output of pws[gid]) for every rules_buf entry
+   * and hands the first two state words to COMPARE_S_SCALAR
+   */
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items past GID_CNT have nothing to do
+
+  /**
+   * digest: words DGST_R0..DGST_R3 of the digest at DIGESTS_OFFSET_HOST; search is not
+   * named again below, presumably COMPARE_S_SCALAR reads it - confirm in the macro
+   */
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * base: COPY_PW/PASTE_PW are macros defined elsewhere; presumably they stage pws[gid] for the per-rule copy - confirm
+   */
+
+  COPY_PW (pws[gid]);
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len); // the salt is absorbed once, outside the loop
+
+  /**
+   * loop: one rules_buf entry per iteration, each starting from a copy of ctx0
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    pw_t tmp = PASTE_PW;
+
+    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len); // the rule's return value becomes the candidate length
+
+    blake2b_ctx_t ctx = ctx0;
+
+    blake2b_update (&ctx, tmp.i, tmp.pw_len); // the candidate is absorbed after the salt
+    blake2b_final  (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // r0..r3: 32-bit halves of state words h[0] and h[1]
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_S_SCALAR (r0, r1, r2, r3);
+  }
+}
diff --git a/OpenCL/m00620_a1-optimized.cl b/OpenCL/m00620_a1-optimized.cl
new file mode 100644
index 000000000..fa50e5d89
--- /dev/null
+++ b/OpenCL/m00620_a1-optimized.cl
@@ -0,0 +1,434 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00620_m04 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b ($salt.$pass) combinator kernel; builds salt + left word + right word and hashes it with one transform call
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0]; // only the first eight 32-bit words of the base word are read
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_l_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * salt: sixteen 32-bit salt words copied into private arrays, plus the salt length
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * loop: walks combs_buf in steps of VECT_SIZE
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
+
+    const u32x pw_len = (pw_l_len + pw_r_len) & 63; // combined length, again masked into 0..63
+
+    /**
+     * concat password candidate: shift one half by the other half's length, then OR both together
+     */
+
+    u32x wordl0[4] = { 0 };
+    u32x wordl1[4] = { 0 };
+    u32x wordl2[4] = { 0 };
+    u32x wordl3[4] = { 0 };
+
+    wordl0[0] = pw_buf0[0];
+    wordl0[1] = pw_buf0[1];
+    wordl0[2] = pw_buf0[2];
+    wordl0[3] = pw_buf0[3];
+    wordl1[0] = pw_buf1[0];
+    wordl1[1] = pw_buf1[1];
+    wordl1[2] = pw_buf1[2];
+    wordl1[3] = pw_buf1[3];
+
+    u32x wordr0[4] = { 0 };
+    u32x wordr1[4] = { 0 };
+    u32x wordr2[4] = { 0 };
+    u32x wordr3[4] = { 0 };
+
+    wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
+    wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
+    wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
+    wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
+    wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
+    wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
+    wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
+    wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
+
+    if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT)
+    {
+      switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); // combs_buf word is moved behind the pws word
+    }
+    else
+    {
+      switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); // pws word is moved behind the combs_buf word
+    }
+
+    u32x w0[4];
+    u32x w1[4];
+    u32x w2[4];
+    u32x w3[4];
+
+    w0[0] = wordl0[0] | wordr0[0];
+    w0[1] = wordl0[1] | wordr0[1];
+    w0[2] = wordl0[2] | wordr0[2];
+    w0[3] = wordl0[3] | wordr0[3];
+    w1[0] = wordl1[0] | wordr1[0];
+    w1[1] = wordl1[1] | wordr1[1];
+    w1[2] = wordl1[2] | wordr1[2];
+    w1[3] = wordl1[3] | wordr1[3];
+    w2[0] = wordl2[0] | wordr2[0];
+    w2[1] = wordl2[1] | wordr2[1];
+    w2[2] = wordl2[2] | wordr2[2];
+    w2[3] = wordl2[3] | wordr2[3];
+    w3[0] = wordl3[0] | wordr3[0];
+    w3[1] = wordl3[1] | wordr3[1];
+    w3[2] = wordl3[2] | wordr3[2];
+    w3[3] = wordl3[3] | wordr3[3];
+
+    /**
+     * prepend salt: shift the candidate by salt_len, then OR the salt words in front of it
+     */
+
+    switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
+
+    const u32x pw_salt_len = pw_len + salt_len; // total message length handed to the transform
+
+    w0[0] |= salt_buf0[0];
+    w0[1] |= salt_buf0[1];
+    w0[2] |= salt_buf0[2];
+    w0[3] |= salt_buf0[3];
+    w1[0] |= salt_buf1[0];
+    w1[1] |= salt_buf1[1];
+    w1[2] |= salt_buf1[2];
+    w1[3] |= salt_buf1[3];
+    w2[0] |= salt_buf2[0];
+    w2[1] |= salt_buf2[1];
+    w2[2] |= salt_buf2[2];
+    w2[3] |= salt_buf2[3];
+    w3[0] |= salt_buf3[0];
+    w3[1] |= salt_buf3[1];
+    w3[2] |= salt_buf3[2];
+    w3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: pack word pairs into m[0..7], keep m[8..15] zero, run a single final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]); // odd word is passed as the high half, even word as the low half
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: depth 1, fanout 1, no key, 64-byte digest
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_m08 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work (NOTE(review): presumably m00620_m04 alone serves this attack mode - confirm)
+}
+
+KERNEL_FQ void m00620_m16 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work (NOTE(review): presumably m00620_m04 alone serves this attack mode - confirm)
+}
+
+KERNEL_FQ void m00620_s04 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b ($salt.$pass) combinator kernel that loads one target digest; hashes salt + left word + right word with one transform call
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0]; // only the first eight 32-bit words of the base word are read
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_l_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * salt: sixteen 32-bit salt words copied into private arrays, plus the salt length
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  /**
+   * digest: four 32-bit words of the target entry at DIGESTS_OFFSET_HOST
+   */
+
+  const u32 search[4] = // not named again below; NOTE(review): presumably read inside COMPARE_S_SIMD - confirm
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * loop: walks combs_buf in steps of VECT_SIZE
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
+
+    const u32x pw_len = (pw_l_len + pw_r_len) & 63; // combined length, again masked into 0..63
+
+    /**
+     * concat password candidate: shift one half by the other half's length, then OR both together
+     */
+
+    u32x wordl0[4] = { 0 };
+    u32x wordl1[4] = { 0 };
+    u32x wordl2[4] = { 0 };
+    u32x wordl3[4] = { 0 };
+
+    wordl0[0] = pw_buf0[0];
+    wordl0[1] = pw_buf0[1];
+    wordl0[2] = pw_buf0[2];
+    wordl0[3] = pw_buf0[3];
+    wordl1[0] = pw_buf1[0];
+    wordl1[1] = pw_buf1[1];
+    wordl1[2] = pw_buf1[2];
+    wordl1[3] = pw_buf1[3];
+
+    u32x wordr0[4] = { 0 };
+    u32x wordr1[4] = { 0 };
+    u32x wordr2[4] = { 0 };
+    u32x wordr3[4] = { 0 };
+
+    wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
+    wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
+    wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
+    wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
+    wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
+    wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
+    wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
+    wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
+
+    if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT)
+    {
+      switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); // combs_buf word is moved behind the pws word
+    }
+    else
+    {
+      switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); // pws word is moved behind the combs_buf word
+    }
+
+    u32x w0[4];
+    u32x w1[4];
+    u32x w2[4];
+    u32x w3[4];
+
+    w0[0] = wordl0[0] | wordr0[0];
+    w0[1] = wordl0[1] | wordr0[1];
+    w0[2] = wordl0[2] | wordr0[2];
+    w0[3] = wordl0[3] | wordr0[3];
+    w1[0] = wordl1[0] | wordr1[0];
+    w1[1] = wordl1[1] | wordr1[1];
+    w1[2] = wordl1[2] | wordr1[2];
+    w1[3] = wordl1[3] | wordr1[3];
+    w2[0] = wordl2[0] | wordr2[0];
+    w2[1] = wordl2[1] | wordr2[1];
+    w2[2] = wordl2[2] | wordr2[2];
+    w2[3] = wordl2[3] | wordr2[3];
+    w3[0] = wordl3[0] | wordr3[0];
+    w3[1] = wordl3[1] | wordr3[1];
+    w3[2] = wordl3[2] | wordr3[2];
+    w3[3] = wordl3[3] | wordr3[3];
+
+    /**
+     * prepend salt: shift the candidate by salt_len, then OR the salt words in front of it
+     */
+
+    switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
+
+    const u32x pw_salt_len = pw_len + salt_len; // total message length handed to the transform
+
+    w0[0] |= salt_buf0[0];
+    w0[1] |= salt_buf0[1];
+    w0[2] |= salt_buf0[2];
+    w0[3] |= salt_buf0[3];
+    w1[0] |= salt_buf1[0];
+    w1[1] |= salt_buf1[1];
+    w1[2] |= salt_buf1[2];
+    w1[3] |= salt_buf1[3];
+    w2[0] |= salt_buf2[0];
+    w2[1] |= salt_buf2[1];
+    w2[2] |= salt_buf2[2];
+    w2[3] |= salt_buf2[3];
+    w3[0] |= salt_buf3[0];
+    w3[1] |= salt_buf3[1];
+    w3[2] |= salt_buf3[2];
+    w3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: pack word pairs into m[0..7], keep m[8..15] zero, run a single final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (w0[1], w0[0]); // odd word is passed as the high half, even word as the low half
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: depth 1, fanout 1, no key, 64-byte digest
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_s08 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work (NOTE(review): presumably m00620_s04 alone serves this attack mode - confirm)
+}
+
+KERNEL_FQ void m00620_s16 (KERN_ATTR_BASIC ())
+{ // empty body: this entry point performs no work (NOTE(review): presumably m00620_s04 alone serves this attack mode - confirm)
+}
+
diff --git a/OpenCL/m00620_a1-pure.cl b/OpenCL/m00620_a1-pure.cl
new file mode 100644
index 000000000..70ffccc8c
--- /dev/null
+++ b/OpenCL/m00620_a1-pure.cl
@@ -0,0 +1,114 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+//#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_scalar.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00620_mxx (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b ($salt.$pass) combinator kernel using the streaming context API
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  /**
+   * base: ctx0 absorbs the salt and then the pws[gid] word once, outside the loop
+   */
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len);
+
+  blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);
+
+  /**
+   * loop: one candidate per combs_buf entry, appended after the shared prefix
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    blake2b_ctx_t ctx = ctx0; // private copy, so ctx0 keeps the salt + base word state
+
+    blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
+
+    blake2b_final (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_M_SCALAR (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_sxx (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: BLAKE2b ($salt.$pass) combinator kernel using the streaming context API, with one target digest loaded
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  /**
+   * digest: four 32-bit words of the target entry at DIGESTS_OFFSET_HOST
+   */
+
+  const u32 search[4] = // not named again below; NOTE(review): presumably read inside COMPARE_S_SCALAR - confirm
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * base: ctx0 absorbs the salt and then the pws[gid] word once, outside the loop
+   */
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len);
+
+  blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);
+
+  /**
+   * loop: one candidate per combs_buf entry, appended after the shared prefix
+   */
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
+  {
+    blake2b_ctx_t ctx = ctx0; // private copy, so ctx0 keeps the salt + base word state
+
+    blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
+
+    blake2b_final (&ctx);
+
+    const u32 r0 = h32_from_64_S (ctx.h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32 r1 = l32_from_64_S (ctx.h[0]);
+    const u32 r2 = h32_from_64_S (ctx.h[1]);
+    const u32 r3 = l32_from_64_S (ctx.h[1]);
+
+    COMPARE_S_SCALAR (r0, r1, r2, r3);
+  }
+}
+
diff --git a/OpenCL/m00620_a3-optimized.cl b/OpenCL/m00620_a3-optimized.cl
new file mode 100644
index 000000000..64cb352c5
--- /dev/null
+++ b/OpenCL/m00620_a3-optimized.cl
@@ -0,0 +1,590 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+DECLSPEC void m00620m (PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const u32 pw_len, KERN_ATTR_FUNC_BASIC ())
+{
+  /**
+   * modifiers are taken from args: w0..w3 carry the base word, pw_len its length; hashes salt + candidate with BLAKE2b
+   */
+
+  /**
+   * salt: sixteen 32-bit salt words copied into private arrays, plus the salt length
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  const u32 pw_salt_len = pw_len + salt_len; // total message length handed to the transform
+
+  /**
+   * loop: only word 0 of the candidate changes per iteration; bfs_buf is walked in steps of VECT_SIZE
+   */
+
+  const u32 w0l = w0[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = ix_create_bft (bfs_buf, il_pos);
+
+    const u32x w0lr = w0l | w0r; // merge the fixed bits of word 0 with the bits taken from bfs_buf
+
+    u32x t0[4];
+    u32x t1[4];
+    u32x t2[4];
+    u32x t3[4];
+
+    t0[0] = w0lr;
+    t0[1] = w0[1];
+    t0[2] = w0[2];
+    t0[3] = w0[3];
+    t1[0] = w1[0];
+    t1[1] = w1[1];
+    t1[2] = w1[2];
+    t1[3] = w1[3];
+    t2[0] = w2[0];
+    t2[1] = w2[1];
+    t2[2] = w2[2];
+    t2[3] = w2[3];
+    t3[0] = w3[0];
+    t3[1] = w3[1];
+    t3[2] = w3[2];
+    t3[3] = w3[3];
+
+    switch_buffer_by_offset_le (t0, t1, t2, t3, salt_len); // shift the candidate by salt_len before the salt is OR-ed in front
+
+    t0[0] |= salt_buf0[0];
+    t0[1] |= salt_buf0[1];
+    t0[2] |= salt_buf0[2];
+    t0[3] |= salt_buf0[3];
+    t1[0] |= salt_buf1[0];
+    t1[1] |= salt_buf1[1];
+    t1[2] |= salt_buf1[2];
+    t1[3] |= salt_buf1[3];
+    t2[0] |= salt_buf2[0];
+    t2[1] |= salt_buf2[1];
+    t2[2] |= salt_buf2[2];
+    t2[3] |= salt_buf2[3];
+    t3[0] |= salt_buf3[0];
+    t3[1] |= salt_buf3[1];
+    t3[2] |= salt_buf3[2];
+    t3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: pack word pairs into m[0..7], keep m[8..15] zero, run a single final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (t0[1], t0[0]); // odd word is passed as the high half, even word as the low half
+    m[ 1] = hl32_to_64 (t0[3], t0[2]);
+    m[ 2] = hl32_to_64 (t1[1], t1[0]);
+    m[ 3] = hl32_to_64 (t1[3], t1[2]);
+    m[ 4] = hl32_to_64 (t2[1], t2[0]);
+    m[ 5] = hl32_to_64 (t2[3], t2[2]);
+    m[ 6] = hl32_to_64 (t3[1], t3[0]);
+    m[ 7] = hl32_to_64 (t3[3], t3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: depth 1, fanout 1, no key, 64-byte digest
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+DECLSPEC void m00620s (PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const u32 pw_len, KERN_ATTR_FUNC_BASIC ())
+{
+  /**
+   * modifiers are taken from args: w0..w3 carry the base word, pw_len its length; hashes salt + candidate with BLAKE2b
+   */
+
+  /**
+   * digest: four 32-bit words of the target entry at DIGESTS_OFFSET_HOST
+   */
+
+  const u32 search[4] = // not named again below; NOTE(review): presumably read inside COMPARE_S_SIMD - confirm
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * salt: sixteen 32-bit salt words copied into private arrays, plus the salt length
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  const u32 pw_salt_len = pw_len + salt_len; // total message length handed to the transform
+
+  /**
+   * loop: only word 0 of the candidate changes per iteration; bfs_buf is walked in steps of VECT_SIZE
+   */
+
+  const u32 w0l = w0[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = ix_create_bft (bfs_buf, il_pos);
+
+    const u32x w0lr = w0l | w0r; // merge the fixed bits of word 0 with the bits taken from bfs_buf
+
+    u32x t0[4];
+    u32x t1[4];
+    u32x t2[4];
+    u32x t3[4];
+
+    t0[0] = w0lr;
+    t0[1] = w0[1];
+    t0[2] = w0[2];
+    t0[3] = w0[3];
+    t1[0] = w1[0];
+    t1[1] = w1[1];
+    t1[2] = w1[2];
+    t1[3] = w1[3];
+    t2[0] = w2[0];
+    t2[1] = w2[1];
+    t2[2] = w2[2];
+    t2[3] = w2[3];
+    t3[0] = w3[0];
+    t3[1] = w3[1];
+    t3[2] = w3[2];
+    t3[3] = w3[3];
+
+    switch_buffer_by_offset_le (t0, t1, t2, t3, salt_len); // shift the candidate by salt_len before the salt is OR-ed in front
+
+    t0[0] |= salt_buf0[0];
+    t0[1] |= salt_buf0[1];
+    t0[2] |= salt_buf0[2];
+    t0[3] |= salt_buf0[3];
+    t1[0] |= salt_buf1[0];
+    t1[1] |= salt_buf1[1];
+    t1[2] |= salt_buf1[2];
+    t1[3] |= salt_buf1[3];
+    t2[0] |= salt_buf2[0];
+    t2[1] |= salt_buf2[1];
+    t2[2] |= salt_buf2[2];
+    t2[3] |= salt_buf2[3];
+    t3[0] |= salt_buf3[0];
+    t3[1] |= salt_buf3[1];
+    t3[2] |= salt_buf3[2];
+    t3[3] |= salt_buf3[3];
+
+    /**
+     * blake2b: pack word pairs into m[0..7], keep m[8..15] zero, run a single final transform
+     */
+
+    u64x m[16];
+
+    m[ 0] = hl32_to_64 (t0[1], t0[0]); // odd word is passed as the high half, even word as the low half
+    m[ 1] = hl32_to_64 (t0[3], t0[2]);
+    m[ 2] = hl32_to_64 (t1[1], t1[0]);
+    m[ 3] = hl32_to_64 (t1[3], t1[2]);
+    m[ 4] = hl32_to_64 (t2[1], t2[0]);
+    m[ 5] = hl32_to_64 (t2[3], t2[2]);
+    m[ 6] = hl32_to_64 (t3[1], t3[0]);
+    m[ 7] = hl32_to_64 (t3[3], t3[2]);
+    m[ 8] = 0;
+    m[ 9] = 0;
+    m[10] = 0;
+    m[11] = 0;
+    m[12] = 0;
+    m[13] = 0;
+    m[14] = 0;
+    m[15] = 0;
+
+    u64x h[8];
+
+    h[0] = BLAKE2B_IV_00 ^ 0x01010040; // RFC 7693 parameter word: depth 1, fanout 1, no key, 64-byte digest
+    h[1] = BLAKE2B_IV_01;
+    h[2] = BLAKE2B_IV_02;
+    h[3] = BLAKE2B_IV_03;
+    h[4] = BLAKE2B_IV_04;
+    h[5] = BLAKE2B_IV_05;
+    h[6] = BLAKE2B_IV_06;
+    h[7] = BLAKE2B_IV_07;
+
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+
+    const u32x r0 = h32_from_64 (h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32x r1 = l32_from_64 (h[0]);
+    const u32x r2 = h32_from_64 (h[1]);
+    const u32x r3 = l32_from_64 (h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_m04 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: entry point that loads words 0..3 of pws[gid], zeroes the other twelve, and delegates to m00620m
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = 0;
+  w1[1] = 0;
+  w1[2] = 0;
+  w1[3] = 0;
+
+  u32 w2[4];
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * main: forward the word buffers, the length and every kernel argument unchanged
+   */
+
+  m00620m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00620_m08 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: entry point that loads words 0..7 of pws[gid], zeroes the other eight, and delegates to m00620m
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4];
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * main: forward the word buffers, the length and every kernel argument unchanged
+   */
+
+  m00620m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00620_m16 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: entry point that loads all sixteen words 0..15 of pws[gid] and delegates to m00620m
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4];
+
+  w2[0] = pws[gid].i[ 8];
+  w2[1] = pws[gid].i[ 9];
+  w2[2] = pws[gid].i[10];
+  w2[3] = pws[gid].i[11];
+
+  u32 w3[4];
+
+  w3[0] = pws[gid].i[12];
+  w3[1] = pws[gid].i[13];
+  w3[2] = pws[gid].i[14];
+  w3[3] = pws[gid].i[15];
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * main: forward the word buffers, the length and every kernel argument unchanged
+   */
+
+  m00620m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00620_s04 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: entry point that loads words 0..3 of pws[gid], zeroes the other twelve, and delegates to m00620s
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = 0;
+  w1[1] = 0;
+  w1[2] = 0;
+  w1[3] = 0;
+
+  u32 w2[4];
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * main: forward the word buffers, the length and every kernel argument unchanged
+   */
+
+  m00620s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00620_s08 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: entry point that loads words 0..7 of pws[gid], zeroes the other eight, and delegates to m00620s
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4];
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * main: forward the word buffers, the length and every kernel argument unchanged
+   */
+
+  m00620s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
+KERNEL_FQ void m00620_s16 (KERN_ATTR_BASIC ())
+{
+  /**
+   * base: entry point that loads all sixteen words 0..15 of pws[gid] and delegates to m00620s
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4];
+
+  w2[0] = pws[gid].i[ 8];
+  w2[1] = pws[gid].i[ 9];
+  w2[2] = pws[gid].i[10];
+  w2[3] = pws[gid].i[11];
+
+  u32 w3[4];
+
+  w3[0] = pws[gid].i[12];
+  w3[1] = pws[gid].i[13];
+  w3[2] = pws[gid].i[14];
+  w3[3] = pws[gid].i[15];
+
+  const u32 pw_len = pws[gid].pw_len & 63; // length masked into 0..63
+
+  /**
+   * main: forward the word buffers, the length and every kernel argument unchanged
+   */
+
+  m00620s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
+}
+
diff --git a/OpenCL/m00620_a3-pure.cl b/OpenCL/m00620_a3-pure.cl
new file mode 100644
index 000000000..2b34cd3ba
--- /dev/null
+++ b/OpenCL/m00620_a3-pure.cl
@@ -0,0 +1,148 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define NEW_SIMD_CODE
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_vendor.h)
+#include M2S(INCLUDE_PATH/inc_types.h)
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_simd.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#endif
+
+KERNEL_FQ void m00620_mxx (KERN_ATTR_VECTOR ())
+{
+  /**
+   * modifier: BLAKE2b ($salt.$pass) kernel that varies word 0 of the candidate from words_buf_r
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  /**
+   * base: copy the base word into w[] and absorb the salt once into the scalar ctx0
+   */
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  u32x w[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) // one 32-bit word per started group of four bytes
+  {
+    w[idx] = pws[gid].i[idx];
+  }
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len);
+
+  /**
+   * loop: words_buf_r is consumed one vector per iteration, il_pos advances by VECT_SIZE
+   */
+
+  u32x w0l = w[0]; // word 0 as loaded from pws, kept aside because w[0] is overwritten below
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+    const u32x w0 = w0l | w0r;
+
+    w[0] = w0;
+
+    blake2b_ctx_vector_t ctx;
+
+    blake2b_init_vector_from_scalar   (&ctx, &ctx0); // vector context seeded from the salted scalar context
+
+    blake2b_update_vector (&ctx, w, pw_len);
+
+    blake2b_final_vector  (&ctx);
+
+    const u32x r0 = h32_from_64 (ctx.h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32x r1 = l32_from_64 (ctx.h[0]);
+    const u32x r2 = h32_from_64 (ctx.h[1]);
+    const u32x r3 = l32_from_64 (ctx.h[1]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+
+KERNEL_FQ void m00620_sxx (KERN_ATTR_VECTOR ())
+{
+  /**
+   * modifier: BLAKE2b ($salt.$pass) kernel that varies word 0 of the candidate from words_buf_r, with one target digest loaded
+   */
+
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return; // work-items at or beyond GID_CNT have nothing to do
+
+  /**
+   * digest: four 32-bit words of the target entry at DIGESTS_OFFSET_HOST
+   */
+
+  const u32 search[4] = // not named again below; NOTE(review): presumably read inside COMPARE_S_SIMD - confirm
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * base: copy the base word into w[] and absorb the salt once into the scalar ctx0
+   */
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  u32x w[64] = { 0 };
+
+  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) // one 32-bit word per started group of four bytes
+  {
+    w[idx] = pws[gid].i[idx];
+  }
+
+  blake2b_ctx_t ctx0;
+
+  blake2b_init (&ctx0);
+
+  blake2b_update_global (&ctx0, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len);
+
+  /**
+   * loop: words_buf_r is consumed one vector per iteration, il_pos advances by VECT_SIZE
+   */
+
+  u32x w0l = w[0]; // word 0 as loaded from pws, kept aside because w[0] is overwritten below
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+
+    const u32x w0 = w0l | w0r;
+
+    w[0] = w0;
+
+    blake2b_ctx_vector_t ctx;
+
+    blake2b_init_vector_from_scalar   (&ctx, &ctx0); // vector context seeded from the salted scalar context
+
+    blake2b_update_vector (&ctx, w, pw_len);
+
+    blake2b_final_vector  (&ctx);
+
+    const u32x r0 = h32_from_64 (ctx.h[0]); // r0..r3: 32-bit halves of h[0] then h[1] (high, low - going by the helper names)
+    const u32x r1 = l32_from_64 (ctx.h[0]);
+    const u32x r2 = h32_from_64 (ctx.h[1]);
+    const u32x r3 = l32_from_64 (ctx.h[1]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+
diff --git a/docs/changes.txt b/docs/changes.txt
index 383845c34..d6e4b7448 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -4,6 +4,8 @@
 ## Algorithms
 ##
 
+- Added hash-mode: BLAKE2b-512($salt.$pass)
+- Added hash-mode: BLAKE2b-512($pass.$salt)
 - Added hash-mode: Amazon AWS4-HMAC-SHA256
 - Added hash-mode: DPAPI masterkey file v1 (context 3)
 - Added hash-mode: DPAPI masterkey file v2 (context 3)
diff --git a/docs/readme.txt b/docs/readme.txt
index 6cee08f1a..ec40b8ad9 100644
--- a/docs/readme.txt
+++ b/docs/readme.txt
@@ -124,6 +124,8 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or
 - sha512($salt.$pass)
 - sha512($salt.utf16le($pass))
 - sha512(utf16le($pass).$salt)
+- BLAKE2b-512($pass.$salt)
+- BLAKE2b-512($salt.$pass)
 - HMAC-MD5 (key = $pass)
 - HMAC-MD5 (key = $salt)
 - HMAC-SHA1 (key = $pass)
diff --git a/src/modules/module_00610.c b/src/modules/module_00610.c
new file mode 100644
index 000000000..6dd11a126
--- /dev/null
+++ b/src/modules/module_00610.c
@@ -0,0 +1,221 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "common.h"
+#include "types.h"
+#include "modules.h"
+#include "bitops.h"
+#include "convert.h"
+#include "shared.h"
+
+static const u32   ATTACK_EXEC    = ATTACK_EXEC_INSIDE_KERNEL;
+static const u32   DGST_POS0      = 1; // NOTE(review): 1,0,3,2 presumably mirrors the kernels' r0 = h32_from_64 (h[0]), r1 = l32_from_64 (h[0]) -- confirm
+static const u32   DGST_POS1      = 0;
+static const u32   DGST_POS2      = 3;
+static const u32   DGST_POS3      = 2;
+static const u32   DGST_SIZE      = DGST_SIZE_8_8;
+static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH_SALTED;
+static const char *HASH_NAME      = "BLAKE2b-512($pass.$salt)";
+static const u64   KERN_TYPE      = 610;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_APPENDED_SALT // salt follows the password, as in HASH_NAME
+                                  | OPTI_TYPE_USES_BITS_64
+                                  | OPTI_TYPE_RAW_HASH;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
+static const u32   SALT_TYPE      = SALT_TYPE_GENERIC;
+static const char *ST_PASS        = "hashcat";
+static const char *ST_HASH        = "$BLAKE2$41fcd44c789c735c08b43a871b81c8f617ca43918d38aee6cf8291c58a0b00a03115857425e5ff6f044be7a5bec8536b52d6c9992e21cd43cdca8a55bbf1f5c1:1033";
+// Getters below return the constants above unchanged; their three parameters are unused.
+u32         module_attack_exec    (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ATTACK_EXEC;     }
+u32         module_dgst_pos0      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS0;       }
+u32         module_dgst_pos1      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS1;       }
+u32         module_dgst_pos2      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS2;       }
+u32         module_dgst_pos3      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS3;       }
+u32         module_dgst_size      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_SIZE;       }
+u32         module_hash_category  (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_CATEGORY;   }
+const char *module_hash_name      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_NAME;       }
+u64         module_kern_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return KERN_TYPE;       }
+u32         module_opti_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTI_TYPE;       }
+u64         module_opts_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTS_TYPE;       }
+u32         module_salt_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return SALT_TYPE;       }
+const char *module_st_hash        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH;         }
+const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS;         }
+
+static const char *SIGNATURE_BLAKE2B = "$BLAKE2$";
+// Parses "$BLAKE2$" + 128 hex digits + separator + salt into digest_buf and salt.
+int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
+{
+  u64 *digest = (u64 *) digest_buf;
+
+  hc_token_t token;
+
+  token.token_cnt = 3;
+
+  // token 0: fixed 8-byte signature, flagged for verification against SIGNATURE_BLAKE2B
+  token.signatures_cnt    = 1;
+  token.signatures_buf[0] = SIGNATURE_BLAKE2B;
+
+  token.len[0]  = 8;
+  token.attr[0] = TOKEN_ATTR_FIXED_LENGTH
+                | TOKEN_ATTR_VERIFY_SIGNATURE;
+
+  // token 1: exactly 128 hex digits, delimited by hashconfig->separator
+  token.sep[1]     = hashconfig->separator;
+  token.len_min[1] = 128;
+  token.len_max[1] = 128;
+  token.attr[1]    = TOKEN_ATTR_VERIFY_LENGTH
+                   | TOKEN_ATTR_VERIFY_HEX;
+
+  // token 2: salt, length limited to SALT_MIN..SALT_MAX
+  token.len_min[2] = SALT_MIN;
+  token.len_max[2] = SALT_MAX;
+  token.attr[2]    = TOKEN_ATTR_VERIFY_LENGTH;
+
+  if (hashconfig->opts_type & OPTS_TYPE_ST_HEX) { // hex salt: both limits double and hex digits are verified
+    token.len_min[2] *= 2;
+    token.len_max[2] *= 2;
+
+    token.attr[2] |= TOKEN_ATTR_VERIFY_HEX;
+  }
+
+  const int rc_tokenizer = input_tokenizer ((const u8 *) line_buf, line_len, &token);
+
+  if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); // tokenizer errors are returned unchanged
+
+  const u8 *hash_pos = token.buf[1];
+
+  digest[0] = hex_to_u64 (hash_pos +   0); // eight 64-bit words, 16 hex digits each
+  digest[1] = hex_to_u64 (hash_pos +  16);
+  digest[2] = hex_to_u64 (hash_pos +  32);
+  digest[3] = hex_to_u64 (hash_pos +  48);
+  digest[4] = hex_to_u64 (hash_pos +  64);
+  digest[5] = hex_to_u64 (hash_pos +  80);
+  digest[6] = hex_to_u64 (hash_pos +  96);
+  digest[7] = hex_to_u64 (hash_pos + 112);
+
+  // hand the salt token to generic_salt_decode with salt->salt_buf / salt->salt_len as outputs
+
+  const u8 *salt_pos = token.buf[2];
+  const int salt_len = token.len[2];
+
+  const bool parse_rc = generic_salt_decode (hashconfig, salt_pos, salt_len, (u8 *) salt->salt_buf, (int *) &salt->salt_len);
+
+  if (parse_rc == false) return (PARSER_SALT_LENGTH);
+
+  return (PARSER_OK);
+}
+
+int module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const void *digest_buf, MAYBE_UNUSED const salt_t *salt, MAYBE_UNUSED const void *esalt_buf, MAYBE_UNUSED const void *hook_salt_buf, MAYBE_UNUSED const hashinfo_t *hash_info, char *line_buf, MAYBE_UNUSED const int line_size)
+{
+  const u64 *digest = (const u64 *) digest_buf;
+
+  // writes signature + 128 hex digits + separator + salt straight into line_buf;
+  // digest_buf is only read (no scratch copy is made); returns the accumulated length
+
+  u8 *out_buf = (u8 *) line_buf;
+
+  int out_len = strlen (SIGNATURE_BLAKE2B);
+
+  // signature (copied without a terminating NUL)
+  memcpy (out_buf, SIGNATURE_BLAKE2B, out_len);
+
+  // hash: eight 64-bit words, 16 hex digits each
+  u64_to_hex (digest[0], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[1], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[2], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[3], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[4], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[5], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[6], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[7], out_buf + out_len); out_len += 16;
+
+  // separator between digest and salt (hashconfig->separator)
+  out_buf[out_len] = hashconfig->separator;
+  out_len += 1;
+
+  // salt: generic_salt_encode's return value is added to the running length
+  out_len += generic_salt_encode (hashconfig, (const u8 *) salt->salt_buf, (const int) salt->salt_len, out_buf + out_len);
+
+  return out_len;
+}
+
+void module_init (module_ctx_t *module_ctx)
+{
+  module_ctx->module_context_size             = MODULE_CONTEXT_SIZE_CURRENT;
+  module_ctx->module_interface_version        = MODULE_INTERFACE_VERSION_CURRENT;
+  // hooks implemented in this file are assigned by name; every other hook gets MODULE_DEFAULT
+  module_ctx->module_attack_exec              = module_attack_exec;
+  module_ctx->module_benchmark_esalt          = MODULE_DEFAULT;
+  module_ctx->module_benchmark_hook_salt      = MODULE_DEFAULT;
+  module_ctx->module_benchmark_mask           = MODULE_DEFAULT;
+  module_ctx->module_benchmark_salt           = MODULE_DEFAULT;
+  module_ctx->module_build_plain_postprocess  = MODULE_DEFAULT;
+  module_ctx->module_deep_comp_kernel         = MODULE_DEFAULT;
+  module_ctx->module_deprecated_notice        = MODULE_DEFAULT;
+  module_ctx->module_dgst_pos0                = module_dgst_pos0;
+  module_ctx->module_dgst_pos1                = module_dgst_pos1;
+  module_ctx->module_dgst_pos2                = module_dgst_pos2;
+  module_ctx->module_dgst_pos3                = module_dgst_pos3;
+  module_ctx->module_dgst_size                = module_dgst_size;
+  module_ctx->module_dictstat_disable         = MODULE_DEFAULT;
+  module_ctx->module_esalt_size               = MODULE_DEFAULT;
+  module_ctx->module_extra_buffer_size        = MODULE_DEFAULT;
+  module_ctx->module_extra_tmp_size           = MODULE_DEFAULT;
+  module_ctx->module_extra_tuningdb_block     = MODULE_DEFAULT;
+  module_ctx->module_forced_outfile_format    = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_count        = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_parse        = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_save         = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_postprocess  = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_potfile      = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_zero_hash    = MODULE_DEFAULT;
+  module_ctx->module_hash_decode              = module_hash_decode;
+  module_ctx->module_hash_encode_status       = MODULE_DEFAULT;
+  module_ctx->module_hash_encode_potfile      = MODULE_DEFAULT;
+  module_ctx->module_hash_encode              = module_hash_encode;
+  module_ctx->module_hash_init_selftest       = MODULE_DEFAULT;
+  module_ctx->module_hash_mode                = MODULE_DEFAULT;
+  module_ctx->module_hash_category            = module_hash_category;
+  module_ctx->module_hash_name                = module_hash_name;
+  module_ctx->module_hashes_count_min         = MODULE_DEFAULT;
+  module_ctx->module_hashes_count_max         = MODULE_DEFAULT;
+  module_ctx->module_hlfmt_disable            = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_size    = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_init    = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_term    = MODULE_DEFAULT;
+  module_ctx->module_hook12                   = MODULE_DEFAULT;
+  module_ctx->module_hook23                   = MODULE_DEFAULT;
+  module_ctx->module_hook_salt_size           = MODULE_DEFAULT;
+  module_ctx->module_hook_size                = MODULE_DEFAULT;
+  module_ctx->module_jit_build_options        = MODULE_DEFAULT;
+  module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kern_type                = module_kern_type;
+  module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
+  module_ctx->module_opti_type                = module_opti_type;
+  module_ctx->module_opts_type                = module_opts_type;
+  module_ctx->module_outfile_check_disable    = MODULE_DEFAULT;
+  module_ctx->module_outfile_check_nocomp     = MODULE_DEFAULT;
+  module_ctx->module_potfile_custom_check     = MODULE_DEFAULT;
+  module_ctx->module_potfile_disable          = MODULE_DEFAULT;
+  module_ctx->module_potfile_keep_all_hashes  = MODULE_DEFAULT;
+  module_ctx->module_pwdump_column            = MODULE_DEFAULT;
+  module_ctx->module_pw_max                   = MODULE_DEFAULT;
+  module_ctx->module_pw_min                   = MODULE_DEFAULT;
+  module_ctx->module_salt_max                 = MODULE_DEFAULT;
+  module_ctx->module_salt_min                 = MODULE_DEFAULT;
+  module_ctx->module_salt_type                = module_salt_type;
+  module_ctx->module_separator                = MODULE_DEFAULT;
+  module_ctx->module_st_hash                  = module_st_hash;
+  module_ctx->module_st_pass                  = module_st_pass;
+  module_ctx->module_tmp_size                 = MODULE_DEFAULT;
+  module_ctx->module_unstable_warning         = MODULE_DEFAULT;
+  module_ctx->module_warmup_disable           = MODULE_DEFAULT;
+}
diff --git a/src/modules/module_00620.c b/src/modules/module_00620.c
new file mode 100644
index 000000000..52bffc26e
--- /dev/null
+++ b/src/modules/module_00620.c
@@ -0,0 +1,221 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "common.h"
+#include "types.h"
+#include "modules.h"
+#include "bitops.h"
+#include "convert.h"
+#include "shared.h"
+
+static const u32   ATTACK_EXEC    = ATTACK_EXEC_INSIDE_KERNEL;
+static const u32   DGST_POS0      = 1; // NOTE(review): 1,0,3,2 presumably mirrors the kernels' r0 = h32_from_64 (h[0]), r1 = l32_from_64 (h[0]) -- confirm
+static const u32   DGST_POS1      = 0;
+static const u32   DGST_POS2      = 3;
+static const u32   DGST_POS3      = 2;
+static const u32   DGST_SIZE      = DGST_SIZE_8_8;
+static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH_SALTED;
+static const char *HASH_NAME      = "BLAKE2b-512($salt.$pass)";
+static const u64   KERN_TYPE      = 620;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_PREPENDED_SALT // salt precedes the password, as in HASH_NAME
+                                  | OPTI_TYPE_USES_BITS_64
+                                  | OPTI_TYPE_RAW_HASH;
+static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE;
+static const u32   SALT_TYPE      = SALT_TYPE_GENERIC;
+static const char *ST_PASS        = "hashcat";
+static const char *ST_HASH        = "$BLAKE2$f0325fdfc3f82a014935442f7adbc069d4636d67276a85b09f8de368f122cf5195a0b780d7fee709fbf1dcd02ddcb581df84508cf1fb0f3393af1be0565491c6:3301";
+// Getters below return the constants above unchanged; their three parameters are unused.
+u32         module_attack_exec    (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ATTACK_EXEC;     }
+u32         module_dgst_pos0      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS0;       }
+u32         module_dgst_pos1      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS1;       }
+u32         module_dgst_pos2      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS2;       }
+u32         module_dgst_pos3      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS3;       }
+u32         module_dgst_size      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_SIZE;       }
+u32         module_hash_category  (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_CATEGORY;   }
+const char *module_hash_name      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_NAME;       }
+u64         module_kern_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return KERN_TYPE;       }
+u32         module_opti_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTI_TYPE;       }
+u64         module_opts_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTS_TYPE;       }
+u32         module_salt_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return SALT_TYPE;       }
+const char *module_st_hash        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH;         }
+const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS;         }
+
+static const char *SIGNATURE_BLAKE2B = "$BLAKE2$";
+// Parses "$BLAKE2$" + 128 hex digits + separator + salt into digest_buf and salt.
+int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
+{
+  u64 *digest = (u64 *) digest_buf;
+
+  hc_token_t token;
+
+  token.token_cnt = 3;
+
+  // token 0: fixed 8-byte signature, flagged for verification against SIGNATURE_BLAKE2B
+  token.signatures_cnt    = 1;
+  token.signatures_buf[0] = SIGNATURE_BLAKE2B;
+
+  token.len[0]  = 8;
+  token.attr[0] = TOKEN_ATTR_FIXED_LENGTH
+                | TOKEN_ATTR_VERIFY_SIGNATURE;
+
+  // token 1: exactly 128 hex digits, delimited by hashconfig->separator
+  token.sep[1]     = hashconfig->separator;
+  token.len_min[1] = 128;
+  token.len_max[1] = 128;
+  token.attr[1]    = TOKEN_ATTR_VERIFY_LENGTH
+                   | TOKEN_ATTR_VERIFY_HEX;
+
+  // token 2: salt, length limited to SALT_MIN..SALT_MAX
+  token.len_min[2] = SALT_MIN;
+  token.len_max[2] = SALT_MAX;
+  token.attr[2]    = TOKEN_ATTR_VERIFY_LENGTH;
+
+  if (hashconfig->opts_type & OPTS_TYPE_ST_HEX) { // hex salt: both limits double and hex digits are verified
+    token.len_min[2] *= 2;
+    token.len_max[2] *= 2;
+
+    token.attr[2] |= TOKEN_ATTR_VERIFY_HEX;
+  }
+
+  const int rc_tokenizer = input_tokenizer ((const u8 *) line_buf, line_len, &token);
+
+  if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); // tokenizer errors are returned unchanged
+
+  const u8 *hash_pos = token.buf[1];
+
+  digest[0] = hex_to_u64 (hash_pos +   0); // eight 64-bit words, 16 hex digits each
+  digest[1] = hex_to_u64 (hash_pos +  16);
+  digest[2] = hex_to_u64 (hash_pos +  32);
+  digest[3] = hex_to_u64 (hash_pos +  48);
+  digest[4] = hex_to_u64 (hash_pos +  64);
+  digest[5] = hex_to_u64 (hash_pos +  80);
+  digest[6] = hex_to_u64 (hash_pos +  96);
+  digest[7] = hex_to_u64 (hash_pos + 112);
+
+  // hand the salt token to generic_salt_decode with salt->salt_buf / salt->salt_len as outputs
+
+  const u8 *salt_pos = token.buf[2];
+  const int salt_len = token.len[2];
+
+  const bool parse_rc = generic_salt_decode (hashconfig, salt_pos, salt_len, (u8 *) salt->salt_buf, (int *) &salt->salt_len);
+
+  if (parse_rc == false) return (PARSER_SALT_LENGTH);
+
+  return (PARSER_OK);
+}
+
+int module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const void *digest_buf, MAYBE_UNUSED const salt_t *salt, MAYBE_UNUSED const void *esalt_buf, MAYBE_UNUSED const void *hook_salt_buf, MAYBE_UNUSED const hashinfo_t *hash_info, char *line_buf, MAYBE_UNUSED const int line_size)
+{
+  const u64 *digest = (const u64 *) digest_buf;
+
+  // writes signature + 128 hex digits + separator + salt straight into line_buf;
+  // digest_buf is only read (no scratch copy is made); returns the accumulated length
+
+  u8 *out_buf = (u8 *) line_buf;
+
+  int out_len = strlen (SIGNATURE_BLAKE2B);
+
+  // signature (copied without a terminating NUL)
+  memcpy (out_buf, SIGNATURE_BLAKE2B, out_len);
+
+  // hash: eight 64-bit words, 16 hex digits each
+  u64_to_hex (digest[0], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[1], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[2], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[3], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[4], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[5], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[6], out_buf + out_len); out_len += 16;
+  u64_to_hex (digest[7], out_buf + out_len); out_len += 16;
+
+  // separator between digest and salt (hashconfig->separator)
+  out_buf[out_len] = hashconfig->separator;
+  out_len += 1;
+
+  // salt: generic_salt_encode's return value is added to the running length
+  out_len += generic_salt_encode (hashconfig, (const u8 *) salt->salt_buf, (const int) salt->salt_len, out_buf + out_len);
+
+  return out_len;
+}
+
+void module_init (module_ctx_t *module_ctx)
+{
+  module_ctx->module_context_size             = MODULE_CONTEXT_SIZE_CURRENT;
+  module_ctx->module_interface_version        = MODULE_INTERFACE_VERSION_CURRENT;
+  // hooks implemented in this file are assigned by name; every other hook gets MODULE_DEFAULT
+  module_ctx->module_attack_exec              = module_attack_exec;
+  module_ctx->module_benchmark_esalt          = MODULE_DEFAULT;
+  module_ctx->module_benchmark_hook_salt      = MODULE_DEFAULT;
+  module_ctx->module_benchmark_mask           = MODULE_DEFAULT;
+  module_ctx->module_benchmark_salt           = MODULE_DEFAULT;
+  module_ctx->module_build_plain_postprocess  = MODULE_DEFAULT;
+  module_ctx->module_deep_comp_kernel         = MODULE_DEFAULT;
+  module_ctx->module_deprecated_notice        = MODULE_DEFAULT;
+  module_ctx->module_dgst_pos0                = module_dgst_pos0;
+  module_ctx->module_dgst_pos1                = module_dgst_pos1;
+  module_ctx->module_dgst_pos2                = module_dgst_pos2;
+  module_ctx->module_dgst_pos3                = module_dgst_pos3;
+  module_ctx->module_dgst_size                = module_dgst_size;
+  module_ctx->module_dictstat_disable         = MODULE_DEFAULT;
+  module_ctx->module_esalt_size               = MODULE_DEFAULT;
+  module_ctx->module_extra_buffer_size        = MODULE_DEFAULT;
+  module_ctx->module_extra_tmp_size           = MODULE_DEFAULT;
+  module_ctx->module_extra_tuningdb_block     = MODULE_DEFAULT;
+  module_ctx->module_forced_outfile_format    = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_count        = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_parse        = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_save         = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_postprocess  = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_potfile      = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_zero_hash    = MODULE_DEFAULT;
+  module_ctx->module_hash_decode              = module_hash_decode;
+  module_ctx->module_hash_encode_status       = MODULE_DEFAULT;
+  module_ctx->module_hash_encode_potfile      = MODULE_DEFAULT;
+  module_ctx->module_hash_encode              = module_hash_encode;
+  module_ctx->module_hash_init_selftest       = MODULE_DEFAULT;
+  module_ctx->module_hash_mode                = MODULE_DEFAULT;
+  module_ctx->module_hash_category            = module_hash_category;
+  module_ctx->module_hash_name                = module_hash_name;
+  module_ctx->module_hashes_count_min         = MODULE_DEFAULT;
+  module_ctx->module_hashes_count_max         = MODULE_DEFAULT;
+  module_ctx->module_hlfmt_disable            = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_size    = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_init    = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_term    = MODULE_DEFAULT;
+  module_ctx->module_hook12                   = MODULE_DEFAULT;
+  module_ctx->module_hook23                   = MODULE_DEFAULT;
+  module_ctx->module_hook_salt_size           = MODULE_DEFAULT;
+  module_ctx->module_hook_size                = MODULE_DEFAULT;
+  module_ctx->module_jit_build_options        = MODULE_DEFAULT;
+  module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
+  module_ctx->module_kern_type                = module_kern_type;
+  module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
+  module_ctx->module_opti_type                = module_opti_type;
+  module_ctx->module_opts_type                = module_opts_type;
+  module_ctx->module_outfile_check_disable    = MODULE_DEFAULT;
+  module_ctx->module_outfile_check_nocomp     = MODULE_DEFAULT;
+  module_ctx->module_potfile_custom_check     = MODULE_DEFAULT;
+  module_ctx->module_potfile_disable          = MODULE_DEFAULT;
+  module_ctx->module_potfile_keep_all_hashes  = MODULE_DEFAULT;
+  module_ctx->module_pwdump_column            = MODULE_DEFAULT;
+  module_ctx->module_pw_max                   = MODULE_DEFAULT;
+  module_ctx->module_pw_min                   = MODULE_DEFAULT;
+  module_ctx->module_salt_max                 = MODULE_DEFAULT;
+  module_ctx->module_salt_min                 = MODULE_DEFAULT;
+  module_ctx->module_salt_type                = module_salt_type;
+  module_ctx->module_separator                = MODULE_DEFAULT;
+  module_ctx->module_st_hash                  = module_st_hash;
+  module_ctx->module_st_pass                  = module_st_pass;
+  module_ctx->module_tmp_size                 = MODULE_DEFAULT;
+  module_ctx->module_unstable_warning         = MODULE_DEFAULT;
+  module_ctx->module_warmup_disable           = MODULE_DEFAULT;
+}
diff --git a/tools/test_modules/m00610.pm b/tools/test_modules/m00610.pm
new file mode 100644
index 000000000..b8f5a224d
--- /dev/null
+++ b/tools/test_modules/m00610.pm
@@ -0,0 +1,44 @@
+#!/usr/bin/env perl
+
+##
+## Author......: See docs/credits.txt
+## License.....: MIT
+##
+
+use strict;
+use warnings;
+
+use Digest::BLAKE2 qw (blake2b_hex);
+
+sub module_constraints { [[0, 256], [0, 256], [0, 64], [0, 64], [0, 64]] } # NOTE(review): five [min, max] pairs; presumably word/salt lengths for pure kernels, then word/salt/combined for optimized ones -- confirm against the test harness
+
+sub module_generate_hash
+{
+  my $word = shift;
+  my $salt = shift;
+
+  my $digest = blake2b_hex ($word . $salt); # password first, salt appended
+
+  my $hash = sprintf ("\$BLAKE2\$%s:%s", $digest, $salt); # "$BLAKE2$<hex digest>:<salt>"
+
+  return $hash;
+}
+
+sub module_verify_hash
+{
+  my $line = shift;
+
+  my ($hash, $salt, $word) = split (':', $line); # first three ':'-separated fields: hash, salt, plaintext
+
+  return unless defined $hash; # any missing field => empty return
+  return unless defined $salt;
+  return unless defined $word;
+
+  my $word_packed = pack_if_HEX_notation ($word); # NOTE(review): helper defined outside this file; presumably decodes $HEX[...] plaintexts -- confirm
+
+  my $new_hash = module_generate_hash ($word_packed, $salt);
+
+  return ($new_hash, $word); # regenerated hash plus the word exactly as it appeared in $line
+}
+
+1;
diff --git a/tools/test_modules/m00620.pm b/tools/test_modules/m00620.pm
new file mode 100644
index 000000000..b3a9cba65
--- /dev/null
+++ b/tools/test_modules/m00620.pm
@@ -0,0 +1,44 @@
+#!/usr/bin/env perl
+
+##
+## Author......: See docs/credits.txt
+## License.....: MIT
+##
+
+use strict;
+use warnings;
+
+use Digest::BLAKE2 qw (blake2b_hex);
+
+sub module_constraints { [[0, 256], [0, 256], [0, 64], [0, 64], [0, 64]] } # NOTE(review): five [min, max] pairs; presumably word/salt lengths for pure kernels, then word/salt/combined for optimized ones -- confirm against the test harness
+
+sub module_generate_hash
+{
+  my $word = shift;
+  my $salt = shift;
+
+  my $digest = blake2b_hex ($salt . $word); # salt first, password appended
+
+  my $hash = sprintf ("\$BLAKE2\$%s:%s", $digest, $salt); # "$BLAKE2$<hex digest>:<salt>"
+
+  return $hash;
+}
+
+sub module_verify_hash
+{
+  my $line = shift;
+
+  my ($hash, $salt, $word) = split (':', $line); # first three ':'-separated fields: hash, salt, plaintext
+
+  return unless defined $hash; # any missing field => empty return
+  return unless defined $salt;
+  return unless defined $word;
+
+  my $word_packed = pack_if_HEX_notation ($word); # NOTE(review): helper defined outside this file; presumably decodes $HEX[...] plaintexts -- confirm
+
+  my $new_hash = module_generate_hash ($word_packed, $salt);
+
+  return ($new_hash, $word); # regenerated hash plus the word exactly as it appeared in $line
+}
+
+1;

From 9ce30defcbba758382460bd01e7ce1821e42264b Mon Sep 17 00:00:00 2001
From: tweqx <romla@sfr.fr>
Date: Sat, 21 May 2022 19:32:39 +0200
Subject: [PATCH 2/5] Don't apply the salt in the a3 BLAKE2b($pass.$salt)
 optimized OpenCL code

---
 OpenCL/m00610_a3-optimized.cl | 179 ++++------------------------------
 1 file changed, 18 insertions(+), 161 deletions(-)

diff --git a/OpenCL/m00610_a3-optimized.cl b/OpenCL/m00610_a3-optimized.cl
index 7a406b40e..7402791ef 100644
--- a/OpenCL/m00610_a3-optimized.cl
+++ b/OpenCL/m00610_a3-optimized.cl
@@ -20,38 +20,6 @@ DECLSPEC void m00610m (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
    * modifiers are taken from args
    */
 
-  /**
-   * salt
-   */
-
-  u32 salt_buf0[4];
-  u32 salt_buf1[4];
-  u32 salt_buf2[4];
-  u32 salt_buf3[4];
-
-  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
-  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
-  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
-  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
-  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
-  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
-  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
-  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
-  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
-  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
-  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
-  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
-  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
-  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
-  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
-  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
-
-  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
-
-  const u32 pw_salt_len = pw_len + salt_len;
-
-  switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
   /**
    * loop
    */
@@ -63,59 +31,20 @@ DECLSPEC void m00610m (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
     const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
     const u32x w0x = w0l | w0r;
 
-    u32x w0[4];
-    u32x w1[4];
-    u32x w2[4];
-    u32x w3[4];
-
-    w0[0] = w0x;
-    w0[1] = w[ 1];
-    w0[2] = w[ 2];
-    w0[3] = w[ 3];
-    w1[0] = w[ 4];
-    w1[1] = w[ 5];
-    w1[2] = w[ 6];
-    w1[3] = w[ 7];
-    w2[0] = w[ 8];
-    w2[1] = w[ 9];
-    w2[2] = w[10];
-    w2[3] = w[11];
-    w3[0] = w[12];
-    w3[1] = w[13];
-    w3[2] = w[14];
-    w3[3] = w[15];
-
-    w0[0] |= salt_buf0[0];
-    w0[1] |= salt_buf0[1];
-    w0[2] |= salt_buf0[2];
-    w0[3] |= salt_buf0[3];
-    w1[0] |= salt_buf1[0];
-    w1[1] |= salt_buf1[1];
-    w1[2] |= salt_buf1[2];
-    w1[3] |= salt_buf1[3];
-    w2[0] |= salt_buf2[0];
-    w2[1] |= salt_buf2[1];
-    w2[2] |= salt_buf2[2];
-    w2[3] |= salt_buf2[3];
-    w3[0] |= salt_buf3[0];
-    w3[1] |= salt_buf3[1];
-    w3[2] |= salt_buf3[2];
-    w3[3] |= salt_buf3[3];
-
     /**
      * blake2b
      */
 
     u64x m[16];
 
-    m[ 0] = hl32_to_64 (w0[1], w0[0]);
-    m[ 1] = hl32_to_64 (w0[3], w0[2]);
-    m[ 2] = hl32_to_64 (w1[1], w1[0]);
-    m[ 3] = hl32_to_64 (w1[3], w1[2]);
-    m[ 4] = hl32_to_64 (w2[1], w2[0]);
-    m[ 5] = hl32_to_64 (w2[3], w2[2]);
-    m[ 6] = hl32_to_64 (w3[1], w3[0]);
-    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 0] = hl32_to_64 (w[ 1], w0x  );
+    m[ 1] = hl32_to_64 (w[ 3], w[ 2]);
+    m[ 2] = hl32_to_64 (w[ 5], w[ 4]);
+    m[ 3] = hl32_to_64 (w[ 7], w[ 6]);
+    m[ 4] = hl32_to_64 (w[ 9], w[ 8]);
+    m[ 5] = hl32_to_64 (w[11], w[10]);
+    m[ 6] = hl32_to_64 (w[13], w[12]);
+    m[ 7] = hl32_to_64 (w[15], w[14]);
     m[ 8] = 0;
     m[ 9] = 0;
     m[10] = 0;
@@ -136,7 +65,7 @@ DECLSPEC void m00610m (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
     h[6] = BLAKE2B_IV_06;
     h[7] = BLAKE2B_IV_07;
 
-    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+    blake2b_transform_vector (h, m, pw_len, BLAKE2B_FINAL);
 
     const u32x r0 = h32_from_64 (h[0]);
     const u32x r1 = l32_from_64 (h[0]);
@@ -165,38 +94,6 @@ DECLSPEC void m00610s (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
     digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
   };
 
-  /**
-   * salt
-   */
-
-  u32 salt_buf0[4];
-  u32 salt_buf1[4];
-  u32 salt_buf2[4];
-  u32 salt_buf3[4];
-
-  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
-  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
-  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
-  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
-  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
-  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
-  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
-  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
-  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
-  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
-  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
-  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
-  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
-  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
-  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
-  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
-
-  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
-
-  const u32 pw_salt_len = pw_len + salt_len;
-
-  switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
-
   /**
    * loop
    */
@@ -208,59 +105,20 @@ DECLSPEC void m00610s (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
     const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
     const u32x w0x = w0l | w0r;
 
-    u32x w0[4];
-    u32x w1[4];
-    u32x w2[4];
-    u32x w3[4];
-
-    w0[0] = w0x;
-    w0[1] = w[ 1];
-    w0[2] = w[ 2];
-    w0[3] = w[ 3];
-    w1[0] = w[ 4];
-    w1[1] = w[ 5];
-    w1[2] = w[ 6];
-    w1[3] = w[ 7];
-    w2[0] = w[ 8];
-    w2[1] = w[ 9];
-    w2[2] = w[10];
-    w2[3] = w[11];
-    w3[0] = w[12];
-    w3[1] = w[13];
-    w3[2] = w[14];
-    w3[3] = w[15];
-
-    w0[0] |= salt_buf0[0];
-    w0[1] |= salt_buf0[1];
-    w0[2] |= salt_buf0[2];
-    w0[3] |= salt_buf0[3];
-    w1[0] |= salt_buf1[0];
-    w1[1] |= salt_buf1[1];
-    w1[2] |= salt_buf1[2];
-    w1[3] |= salt_buf1[3];
-    w2[0] |= salt_buf2[0];
-    w2[1] |= salt_buf2[1];
-    w2[2] |= salt_buf2[2];
-    w2[3] |= salt_buf2[3];
-    w3[0] |= salt_buf3[0];
-    w3[1] |= salt_buf3[1];
-    w3[2] |= salt_buf3[2];
-    w3[3] |= salt_buf3[3];
-
     /**
      * blake2b
      */
 
     u64x m[16];
 
-    m[ 0] = hl32_to_64 (w0[1], w0[0]);
-    m[ 1] = hl32_to_64 (w0[3], w0[2]);
-    m[ 2] = hl32_to_64 (w1[1], w1[0]);
-    m[ 3] = hl32_to_64 (w1[3], w1[2]);
-    m[ 4] = hl32_to_64 (w2[1], w2[0]);
-    m[ 5] = hl32_to_64 (w2[3], w2[2]);
-    m[ 6] = hl32_to_64 (w3[1], w3[0]);
-    m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    m[ 0] = hl32_to_64 (w[ 1], w0x  );
+    m[ 1] = hl32_to_64 (w[ 3], w[ 2]);
+    m[ 2] = hl32_to_64 (w[ 5], w[ 4]);
+    m[ 3] = hl32_to_64 (w[ 7], w[ 6]);
+    m[ 4] = hl32_to_64 (w[ 9], w[ 8]);
+    m[ 5] = hl32_to_64 (w[11], w[10]);
+    m[ 6] = hl32_to_64 (w[13], w[12]);
+    m[ 7] = hl32_to_64 (w[15], w[14]);
     m[ 8] = 0;
     m[ 9] = 0;
     m[10] = 0;
@@ -281,7 +139,7 @@ DECLSPEC void m00610s (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
     h[6] = BLAKE2B_IV_06;
     h[7] = BLAKE2B_IV_07;
 
-    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
+    blake2b_transform_vector (h, m, pw_len, BLAKE2B_FINAL);
 
     const u32x r0 = h32_from_64 (h[0]);
     const u32x r1 = l32_from_64 (h[0]);
@@ -531,4 +389,3 @@ KERNEL_FQ void m00610_s16 (KERN_ATTR_VECTOR ())
 
   m00610s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
 }
-

From 97a119da82dc34a52b5a1ae0eef8b55e27f32170 Mon Sep 17 00:00:00 2001
From: tweqx <romla@sfr.fr>
Date: Mon, 23 May 2022 16:53:50 +0200
Subject: [PATCH 3/5] In the unit tests, convert the hexdigest to lowercase to
 match the behavior of 'module_hash_encode'

---
 tools/test_modules/m00610.pm | 2 +-
 tools/test_modules/m00620.pm | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/test_modules/m00610.pm b/tools/test_modules/m00610.pm
index b8f5a224d..61ad8e43b 100644
--- a/tools/test_modules/m00610.pm
+++ b/tools/test_modules/m00610.pm
@@ -19,7 +19,7 @@ sub module_generate_hash
 
   my $digest = blake2b_hex ($word . $salt);
 
-  my $hash = sprintf ("\$BLAKE2\$%s:%s", $digest, $salt);
+  my $hash = sprintf ("\$BLAKE2\$%s:%s", lc ($digest), $salt);
 
   return $hash;
 }
diff --git a/tools/test_modules/m00620.pm b/tools/test_modules/m00620.pm
index b3a9cba65..6c0c9e714 100644
--- a/tools/test_modules/m00620.pm
+++ b/tools/test_modules/m00620.pm
@@ -19,7 +19,7 @@ sub module_generate_hash
 
   my $digest = blake2b_hex ($salt . $word);
 
-  my $hash = sprintf ("\$BLAKE2\$%s:%s", $digest, $salt);
+  my $hash = sprintf ("\$BLAKE2\$%s:%s", lc ($digest), $salt);
 
   return $hash;
 }

From ebcf5bfe20136477f2033aeeb82e9440df1d3139 Mon Sep 17 00:00:00 2001
From: tweqx <romla@sfr.fr>
Date: Wed, 25 May 2022 20:43:32 +0200
Subject: [PATCH 4/5] Partially revert 9ce30defc: apply salt in the a3 610
 multi kernel

---
 OpenCL/m00610_a3-optimized.cl | 90 +++++++++++++++++++++++++++++++----
 1 file changed, 81 insertions(+), 9 deletions(-)

diff --git a/OpenCL/m00610_a3-optimized.cl b/OpenCL/m00610_a3-optimized.cl
index 7402791ef..1ebbffb51 100644
--- a/OpenCL/m00610_a3-optimized.cl
+++ b/OpenCL/m00610_a3-optimized.cl
@@ -14,12 +14,45 @@
 #include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
 #endif
 
+
 DECLSPEC void m00610m (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTOR ())
 {
   /**
    * modifiers are taken from args
    */
 
+  /**
+   * salt
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[SALT_POS_HOST].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[SALT_POS_HOST].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[SALT_POS_HOST].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[SALT_POS_HOST].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[SALT_POS_HOST].salt_buf[10];
+  salt_buf2[3] = salt_bufs[SALT_POS_HOST].salt_buf[11];
+  salt_buf3[0] = salt_bufs[SALT_POS_HOST].salt_buf[12];
+  salt_buf3[1] = salt_bufs[SALT_POS_HOST].salt_buf[13];
+  salt_buf3[2] = salt_bufs[SALT_POS_HOST].salt_buf[14];
+  salt_buf3[3] = salt_bufs[SALT_POS_HOST].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
+
+  const u32 pw_salt_len = pw_len + salt_len;
+
+  switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
+
   /**
    * loop
    */
@@ -31,20 +64,59 @@ DECLSPEC void m00610m (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
     const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
     const u32x w0x = w0l | w0r;
 
+    u32x w0[4];
+    u32x w1[4];
+    u32x w2[4];
+    u32x w3[4];
+
+    w0[0] = w0x;
+    w0[1] = w[ 1];
+    w0[2] = w[ 2];
+    w0[3] = w[ 3];
+    w1[0] = w[ 4];
+    w1[1] = w[ 5];
+    w1[2] = w[ 6];
+    w1[3] = w[ 7];
+    w2[0] = w[ 8];
+    w2[1] = w[ 9];
+    w2[2] = w[10];
+    w2[3] = w[11];
+    w3[0] = w[12];
+    w3[1] = w[13];
+    w3[2] = w[14];
+    w3[3] = w[15];
+
+    w0[0] |= salt_buf0[0];
+    w0[1] |= salt_buf0[1];
+    w0[2] |= salt_buf0[2];
+    w0[3] |= salt_buf0[3];
+    w1[0] |= salt_buf1[0];
+    w1[1] |= salt_buf1[1];
+    w1[2] |= salt_buf1[2];
+    w1[3] |= salt_buf1[3];
+    w2[0] |= salt_buf2[0];
+    w2[1] |= salt_buf2[1];
+    w2[2] |= salt_buf2[2];
+    w2[3] |= salt_buf2[3];
+    w3[0] |= salt_buf3[0];
+    w3[1] |= salt_buf3[1];
+    w3[2] |= salt_buf3[2];
+    w3[3] |= salt_buf3[3];
+
     /**
      * blake2b
      */
 
     u64x m[16];
 
-    m[ 0] = hl32_to_64 (w[ 1], w0x  );
-    m[ 1] = hl32_to_64 (w[ 3], w[ 2]);
-    m[ 2] = hl32_to_64 (w[ 5], w[ 4]);
-    m[ 3] = hl32_to_64 (w[ 7], w[ 6]);
-    m[ 4] = hl32_to_64 (w[ 9], w[ 8]);
-    m[ 5] = hl32_to_64 (w[11], w[10]);
-    m[ 6] = hl32_to_64 (w[13], w[12]);
-    m[ 7] = hl32_to_64 (w[15], w[14]);
+    m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    m[ 7] = hl32_to_64 (w3[3], w3[2]);
     m[ 8] = 0;
     m[ 9] = 0;
     m[10] = 0;
@@ -65,7 +137,7 @@ DECLSPEC void m00610m (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO
     h[6] = BLAKE2B_IV_06;
     h[7] = BLAKE2B_IV_07;
 
-    blake2b_transform_vector (h, m, pw_len, BLAKE2B_FINAL);
+    blake2b_transform_vector (h, m, pw_salt_len, BLAKE2B_FINAL);
 
     const u32x r0 = h32_from_64 (h[0]);
     const u32x r1 = l32_from_64 (h[0]);

From 42c4c1d72faf3de0c68d23e29a5fb035746352f9 Mon Sep 17 00:00:00 2001
From: tweqx <romla@sfr.fr>
Date: Wed, 25 May 2022 22:59:27 +0200
Subject: [PATCH 5/5] In the a1 610 single kernel (m00610_s04), correctly
 apply the last two elements of the salt buffer

---
 OpenCL/m00610_a1-optimized.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/OpenCL/m00610_a1-optimized.cl b/OpenCL/m00610_a1-optimized.cl
index e50f6955c..a6728bce4 100644
--- a/OpenCL/m00610_a1-optimized.cl
+++ b/OpenCL/m00610_a1-optimized.cl
@@ -420,8 +420,8 @@ KERNEL_FQ void m00610_s04 (KERN_ATTR_BASIC ())
     w2[3] |= s2[3];
     w3[0] |= s3[0];
     w3[1] |= s3[1];
-    w3[0] |= s3[2];
-    w3[1] |= s3[3];
+    w3[2] |= s3[2];
+    w3[3] |= s3[3];
 
     /**
      * blake2b