Add pure kernel for -m 1800

2025-04-05 09:15:42 +00:00 · 2017-07-02 23:27:54 +02:00 · 2017-07-02 23:27:54 +02:00 · b9b2112b64
commit b9b2112b64
parent 8e1759650b
5 changed files with 635 additions and 12 deletions
--- a/OpenCL/inc_common.cl
+++ b/OpenCL/inc_common.cl
@ -201,6 +201,27 @@ inline void truncate_block (u32x w[4], const u32 len)
  }
 }

+inline void truncate_block_64 (u32x w[16], const u32 len)
+{
+  // w is handled as four 4-word quarters: the quarter with index len / 16
+  // goes to truncate_block with length (len & 15), every later quarter with
+  // length 0, earlier quarters are skipped; len / 16 > 3 touches nothing.
+
+  const u32 cut  = len / 16;
+  const u32 keep = len & 15;
+
+  if (cut == 0) truncate_block (w +  0, keep);
+
+  if      (cut == 1) truncate_block (w +  4, keep);
+  else if (cut <  1) truncate_block (w +  4, 0);
+
+  if      (cut == 2) truncate_block (w +  8, keep);
+  else if (cut <  2) truncate_block (w +  8, 0);
+
+  if      (cut == 3) truncate_block (w + 12, keep);
+  else if (cut <  3) truncate_block (w + 12, 0);
+}
+
 inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4])
 {
  #ifdef IS_NV
--- a/OpenCL/inc_hash_sha512.cl
+++ b/OpenCL/inc_hash_sha512.cl
@ -394,6 +394,160 @@ void sha512_update (sha512_ctx_t *ctx, const u32 *w, const int len)
  sha512_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - pos1);
 }

+void sha512_update_swap (sha512_ctx_t *ctx, const u32 *w, const int len) // feeds len bytes of w to ctx in 128-byte blocks, applying swap32_S to every word first
+{
+  u32 w0[4]; // w0..w7: staging for one 128-byte block (8 x 4 words)
+  u32 w1[4];
+  u32 w2[4];
+  u32 w3[4];
+  u32 w4[4];
+  u32 w5[4];
+  u32 w6[4];
+  u32 w7[4];
+  // pos1 is the byte offset into the input, pos4 the matching u32 index into w
+  int pos1;
+  int pos4;
+  // Full 128-byte blocks; the loop exits once len - pos1 <= 128, so the last (possibly full) block is left for the code after the loop
+  for (pos1 = 0, pos4 = 0; pos1 < len - 128; pos1 += 128, pos4 += 32)
+  {
+    w0[0] = w[pos4 +  0];
+    w0[1] = w[pos4 +  1];
+    w0[2] = w[pos4 +  2];
+    w0[3] = w[pos4 +  3];
+    w1[0] = w[pos4 +  4];
+    w1[1] = w[pos4 +  5];
+    w1[2] = w[pos4 +  6];
+    w1[3] = w[pos4 +  7];
+    w2[0] = w[pos4 +  8];
+    w2[1] = w[pos4 +  9];
+    w2[2] = w[pos4 + 10];
+    w2[3] = w[pos4 + 11];
+    w3[0] = w[pos4 + 12];
+    w3[1] = w[pos4 + 13];
+    w3[2] = w[pos4 + 14];
+    w3[3] = w[pos4 + 15];
+    w4[0] = w[pos4 + 16];
+    w4[1] = w[pos4 + 17];
+    w4[2] = w[pos4 + 18];
+    w4[3] = w[pos4 + 19];
+    w5[0] = w[pos4 + 20];
+    w5[1] = w[pos4 + 21];
+    w5[2] = w[pos4 + 22];
+    w5[3] = w[pos4 + 23];
+    w6[0] = w[pos4 + 24];
+    w6[1] = w[pos4 + 25];
+    w6[2] = w[pos4 + 26];
+    w6[3] = w[pos4 + 27];
+    w7[0] = w[pos4 + 28];
+    w7[1] = w[pos4 + 29];
+    w7[2] = w[pos4 + 30];
+    w7[3] = w[pos4 + 31];
+    // byte-swap each staged word in place
+    w0[0] = swap32_S (w0[0]);
+    w0[1] = swap32_S (w0[1]);
+    w0[2] = swap32_S (w0[2]);
+    w0[3] = swap32_S (w0[3]);
+    w1[0] = swap32_S (w1[0]);
+    w1[1] = swap32_S (w1[1]);
+    w1[2] = swap32_S (w1[2]);
+    w1[3] = swap32_S (w1[3]);
+    w2[0] = swap32_S (w2[0]);
+    w2[1] = swap32_S (w2[1]);
+    w2[2] = swap32_S (w2[2]);
+    w2[3] = swap32_S (w2[3]);
+    w3[0] = swap32_S (w3[0]);
+    w3[1] = swap32_S (w3[1]);
+    w3[2] = swap32_S (w3[2]);
+    w3[3] = swap32_S (w3[3]);
+    w4[0] = swap32_S (w4[0]);
+    w4[1] = swap32_S (w4[1]);
+    w4[2] = swap32_S (w4[2]);
+    w4[3] = swap32_S (w4[3]);
+    w5[0] = swap32_S (w5[0]);
+    w5[1] = swap32_S (w5[1]);
+    w5[2] = swap32_S (w5[2]);
+    w5[3] = swap32_S (w5[3]);
+    w6[0] = swap32_S (w6[0]);
+    w6[1] = swap32_S (w6[1]);
+    w6[2] = swap32_S (w6[2]);
+    w6[3] = swap32_S (w6[3]);
+    w7[0] = swap32_S (w7[0]);
+    w7[1] = swap32_S (w7[1]);
+    w7[2] = swap32_S (w7[2]);
+    w7[3] = swap32_S (w7[3]);
+
+    sha512_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128);
+  }
+  // Tail: always loads 32 words from w[pos4 .. pos4 + 31], whatever len - pos1 is, so w must be readable that far
+  w0[0] = w[pos4 +  0];
+  w0[1] = w[pos4 +  1];
+  w0[2] = w[pos4 +  2];
+  w0[3] = w[pos4 +  3];
+  w1[0] = w[pos4 +  4];
+  w1[1] = w[pos4 +  5];
+  w1[2] = w[pos4 +  6];
+  w1[3] = w[pos4 +  7];
+  w2[0] = w[pos4 +  8];
+  w2[1] = w[pos4 +  9];
+  w2[2] = w[pos4 + 10];
+  w2[3] = w[pos4 + 11];
+  w3[0] = w[pos4 + 12];
+  w3[1] = w[pos4 + 13];
+  w3[2] = w[pos4 + 14];
+  w3[3] = w[pos4 + 15];
+  w4[0] = w[pos4 + 16];
+  w4[1] = w[pos4 + 17];
+  w4[2] = w[pos4 + 18];
+  w4[3] = w[pos4 + 19];
+  w5[0] = w[pos4 + 20];
+  w5[1] = w[pos4 + 21];
+  w5[2] = w[pos4 + 22];
+  w5[3] = w[pos4 + 23];
+  w6[0] = w[pos4 + 24];
+  w6[1] = w[pos4 + 25];
+  w6[2] = w[pos4 + 26];
+  w6[3] = w[pos4 + 27];
+  w7[0] = w[pos4 + 28];
+  w7[1] = w[pos4 + 29];
+  w7[2] = w[pos4 + 30];
+  w7[3] = w[pos4 + 31];
+
+  w0[0] = swap32_S (w0[0]);
+  w0[1] = swap32_S (w0[1]);
+  w0[2] = swap32_S (w0[2]);
+  w0[3] = swap32_S (w0[3]);
+  w1[0] = swap32_S (w1[0]);
+  w1[1] = swap32_S (w1[1]);
+  w1[2] = swap32_S (w1[2]);
+  w1[3] = swap32_S (w1[3]);
+  w2[0] = swap32_S (w2[0]);
+  w2[1] = swap32_S (w2[1]);
+  w2[2] = swap32_S (w2[2]);
+  w2[3] = swap32_S (w2[3]);
+  w3[0] = swap32_S (w3[0]);
+  w3[1] = swap32_S (w3[1]);
+  w3[2] = swap32_S (w3[2]);
+  w3[3] = swap32_S (w3[3]);
+  w4[0] = swap32_S (w4[0]);
+  w4[1] = swap32_S (w4[1]);
+  w4[2] = swap32_S (w4[2]);
+  w4[3] = swap32_S (w4[3]);
+  w5[0] = swap32_S (w5[0]);
+  w5[1] = swap32_S (w5[1]);
+  w5[2] = swap32_S (w5[2]);
+  w5[3] = swap32_S (w5[3]);
+  w6[0] = swap32_S (w6[0]);
+  w6[1] = swap32_S (w6[1]);
+  w6[2] = swap32_S (w6[2]);
+  w6[3] = swap32_S (w6[3]);
+  w7[0] = swap32_S (w7[0]);
+  w7[1] = swap32_S (w7[1]);
+  w7[2] = swap32_S (w7[2]);
+  w7[3] = swap32_S (w7[3]);
+  // pass the remaining byte count, len - pos1, as the length of this last block
+  sha512_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - pos1);
+}
+
 void sha512_update_global (sha512_ctx_t *ctx, const __global u32 *w, const int len)
 {
  u32 w0[4];
--- a/OpenCL/inc_types.cl
+++ b/OpenCL/inc_types.cl
@ -1131,10 +1131,15 @@ typedef struct sha256crypt_tmp

 typedef struct sha512crypt_tmp
 {
-  u64  l_alt_result[8];
+  u64 l_alt_result[8];
+  u64 l_p_bytes[2];
+  u64 l_s_bytes[2];

-  u64  l_p_bytes[2];
-  u64  l_s_bytes[2];
+  // pure version: fields read and written by the m01800-pure.cl kernels
+
+  u32 alt_result[16]; // running digest as 16 words; written by m01800_init and m01800_loop, read by m01800_comp
+  u32 p_bytes[64]; // filled once by m01800_init, read by m01800_loop with length pw_len
+  u32 s_bytes[64]; // filled once by m01800_init, read by m01800_loop with length salt_len

 } sha512crypt_tmp_t;

--- a/OpenCL/m01800-pure.cl
+++ b/OpenCL/m01800-pure.cl
@ -0,0 +1,438 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#include "inc_vendor.cl"
+#include "inc_hash_constants.h"
+#include "inc_hash_functions.cl"
+#include "inc_types.cl"
+#include "inc_common.cl"
+#include "inc_hash_sha512.cl"
+
+#define COMPARE_S "inc_comp_single.cl"
+#define COMPARE_M "inc_comp_multi.cl"
+
+__kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha512crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{
+  /**
+   * base: init kernel; fills tmps[gid].alt_result, p_bytes and s_bytes from pws[gid] and salt_bufs[salt_pos]
+   */
+
+  const u32 gid = get_global_id (0);
+
+  if (gid >= gid_max) return; // work-items at or past gid_max do nothing
+
+  /**
+   * init: copy password and salt into zero-padded private buffers and byte-swap every word
+   */
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  const u32 pw_lenv = ceil ((float) pw_len / 4); // number of u32 words covering pw_len bytes
+
+  u32 w[64] = { 0 }; // NOTE(review): 64 words hold at most 256 bytes — confirm pw_len is limited to that by the host
+
+  for (int idx = 0; idx < pw_lenv; idx++)
+  {
+    w[idx] = pws[gid].i[idx];
+
+    barrier (CLK_GLOBAL_MEM_FENCE); // NOTE(review): reached after the gid_max early return and inside a loop whose trip count (pw_lenv) differs per work-item — confirm this is intended
+  }
+
+  for (int idx = 0; idx < pw_lenv; idx++)
+  {
+    w[idx] = swap32 (w[idx]);
+  }
+
+  const u32 salt_len = salt_bufs[salt_pos].salt_len;
+
+  const u32 salt_lenv = ceil ((float) salt_len / 4); // number of u32 words covering salt_len bytes
+
+  u32 s[64] = { 0 };
+
+  for (int idx = 0; idx < salt_lenv; idx++)
+  {
+    s[idx] = salt_bufs[salt_pos].salt_buf[idx];
+
+    barrier (CLK_GLOBAL_MEM_FENCE); // NOTE(review): also placed after the gid_max early return — confirm this is intended
+  }
+
+  for (int idx = 0; idx < salt_lenv; idx++)
+  {
+    s[idx] = swap32 (s[idx]);
+  }
+
+  /**
+   * prepare: first digest over password, salt, password; kept in final[0..15]
+   */
+
+  sha512_ctx_t ctx;
+
+  sha512_init (&ctx);
+
+  sha512_update (&ctx, w, pw_len);
+
+  sha512_update (&ctx, s, salt_len);
+
+  sha512_update (&ctx, w, pw_len);
+
+  sha512_final (&ctx);
+
+  u32 final[32] = { 0 }; // only [0..15] are filled; presumably 32 words because sha512_update loads a whole 128-byte block — TODO confirm
+
+  final[ 0] = h32_from_64_S (ctx.h[0]);
+  final[ 1] = l32_from_64_S (ctx.h[0]);
+  final[ 2] = h32_from_64_S (ctx.h[1]);
+  final[ 3] = l32_from_64_S (ctx.h[1]);
+  final[ 4] = h32_from_64_S (ctx.h[2]);
+  final[ 5] = l32_from_64_S (ctx.h[2]);
+  final[ 6] = h32_from_64_S (ctx.h[3]);
+  final[ 7] = l32_from_64_S (ctx.h[3]);
+  final[ 8] = h32_from_64_S (ctx.h[4]);
+  final[ 9] = l32_from_64_S (ctx.h[4]);
+  final[10] = h32_from_64_S (ctx.h[5]);
+  final[11] = l32_from_64_S (ctx.h[5]);
+  final[12] = h32_from_64_S (ctx.h[6]);
+  final[13] = l32_from_64_S (ctx.h[6]);
+  final[14] = h32_from_64_S (ctx.h[7]);
+  final[15] = l32_from_64_S (ctx.h[7]);
+
+  // alt_result: digest over password, salt, pw_len bytes taken from final, then one update per bit of pw_len
+
+  sha512_init (&ctx);
+
+  sha512_update (&ctx, w, pw_len);
+
+  sha512_update (&ctx, s, salt_len);
+
+  int pl;
+  // add final in whole 64-byte pieces while more than 64 bytes of pw_len remain
+	for (pl = pw_len; pl > 64; pl -= 64)
+  {
+    sha512_update (&ctx, final, 64);
+  }
+  // here pl is 1..64 (0 when pw_len is 0); the last piece comes from a copy of final cut to pl bytes
+  u32 t_final[32] = { 0 };
+
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) t_final[i] = swap32 (final[i]);
+  // truncate_block_64 is applied to the byte-swapped words; the swap is undone right after
+  truncate_block_64 (t_final, pl);
+
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) t_final[i] = swap32 (t_final[i]);
+
+  sha512_update (&ctx, t_final, pl);
+  // walk the bits of pw_len from the lowest: a set bit adds final (64 bytes), a clear bit adds the password
+  for (int cnt = pw_len; cnt > 0; cnt >>= 1)
+  {
+    if ((cnt & 1) != 0)
+    {
+      sha512_update (&ctx, final, 64);
+    }
+    else
+    {
+      sha512_update (&ctx, w, pw_len);
+    }
+  }
+
+  sha512_final (&ctx);
+  // store the digest as 16 words, each 64-bit state word split by h32_from_64_S / l32_from_64_S
+  tmps[gid].alt_result[ 0] = h32_from_64_S (ctx.h[0]);
+  tmps[gid].alt_result[ 1] = l32_from_64_S (ctx.h[0]);
+  tmps[gid].alt_result[ 2] = h32_from_64_S (ctx.h[1]);
+  tmps[gid].alt_result[ 3] = l32_from_64_S (ctx.h[1]);
+  tmps[gid].alt_result[ 4] = h32_from_64_S (ctx.h[2]);
+  tmps[gid].alt_result[ 5] = l32_from_64_S (ctx.h[2]);
+  tmps[gid].alt_result[ 6] = h32_from_64_S (ctx.h[3]);
+  tmps[gid].alt_result[ 7] = l32_from_64_S (ctx.h[3]);
+  tmps[gid].alt_result[ 8] = h32_from_64_S (ctx.h[4]);
+  tmps[gid].alt_result[ 9] = l32_from_64_S (ctx.h[4]);
+  tmps[gid].alt_result[10] = h32_from_64_S (ctx.h[5]);
+  tmps[gid].alt_result[11] = l32_from_64_S (ctx.h[5]);
+  tmps[gid].alt_result[12] = h32_from_64_S (ctx.h[6]);
+  tmps[gid].alt_result[13] = l32_from_64_S (ctx.h[6]);
+  tmps[gid].alt_result[14] = h32_from_64_S (ctx.h[7]);
+  tmps[gid].alt_result[15] = l32_from_64_S (ctx.h[7]);
+
+  // p_bytes: digest of the password fed pw_len times, then tiled out to pw_len bytes
+
+  sha512_init (&ctx);
+
+  for (u32 j = 0; j < pw_len; j++)
+  {
+    sha512_update (&ctx, w, pw_len);
+  }
+
+  sha512_final (&ctx);
+
+  final[ 0] = h32_from_64_S (ctx.h[0]);
+  final[ 1] = l32_from_64_S (ctx.h[0]);
+  final[ 2] = h32_from_64_S (ctx.h[1]);
+  final[ 3] = l32_from_64_S (ctx.h[1]);
+  final[ 4] = h32_from_64_S (ctx.h[2]);
+  final[ 5] = l32_from_64_S (ctx.h[2]);
+  final[ 6] = h32_from_64_S (ctx.h[3]);
+  final[ 7] = l32_from_64_S (ctx.h[3]);
+  final[ 8] = h32_from_64_S (ctx.h[4]);
+  final[ 9] = l32_from_64_S (ctx.h[4]);
+  final[10] = h32_from_64_S (ctx.h[5]);
+  final[11] = l32_from_64_S (ctx.h[5]);
+  final[12] = h32_from_64_S (ctx.h[6]);
+  final[13] = l32_from_64_S (ctx.h[6]);
+  final[14] = h32_from_64_S (ctx.h[7]);
+  final[15] = l32_from_64_S (ctx.h[7]);
+  // p_final receives final in 16-word tiles; NOTE(review): the tile writes stay inside p_final[64] only while pw_len <= 256 — confirm
+  u32 p_final[64] = { 0 };
+
+  int idx;
+  // idx is the word offset of the next tile, pl the number of bytes still to cover
+	for (pl = pw_len, idx = 0; pl > 64; pl -= 64, idx += 16)
+  {
+    p_final[idx +  0] = final[ 0];
+    p_final[idx +  1] = final[ 1];
+    p_final[idx +  2] = final[ 2];
+    p_final[idx +  3] = final[ 3];
+    p_final[idx +  4] = final[ 4];
+    p_final[idx +  5] = final[ 5];
+    p_final[idx +  6] = final[ 6];
+    p_final[idx +  7] = final[ 7];
+    p_final[idx +  8] = final[ 8];
+    p_final[idx +  9] = final[ 9];
+    p_final[idx + 10] = final[10];
+    p_final[idx + 11] = final[11];
+    p_final[idx + 12] = final[12];
+    p_final[idx + 13] = final[13];
+    p_final[idx + 14] = final[14];
+    p_final[idx + 15] = final[15];
+  }
+  // last tile: final itself is cut to the remaining pl bytes (swap, truncate_block_64, swap back)
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]);
+
+  truncate_block_64 (final, pl);
+
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]);
+
+  p_final[idx +  0] = final[ 0];
+  p_final[idx +  1] = final[ 1];
+  p_final[idx +  2] = final[ 2];
+  p_final[idx +  3] = final[ 3];
+  p_final[idx +  4] = final[ 4];
+  p_final[idx +  5] = final[ 5];
+  p_final[idx +  6] = final[ 6];
+  p_final[idx +  7] = final[ 7];
+  p_final[idx +  8] = final[ 8];
+  p_final[idx +  9] = final[ 9];
+  p_final[idx + 10] = final[10];
+  p_final[idx + 11] = final[11];
+  p_final[idx + 12] = final[12];
+  p_final[idx + 13] = final[13];
+  p_final[idx + 14] = final[14];
+  p_final[idx + 15] = final[15];
+
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 64; i++) tmps[gid].p_bytes[i] = p_final[i];
+
+  // s_bytes: digest of the salt fed 16 + (top byte of alt_result[0]) times, then tiled out to salt_len bytes
+
+  sha512_init (&ctx);
+
+  for (u32 j = 0; j < 16 + ((tmps[gid].alt_result[ 0] >> 24) & 0xff); j++)
+  {
+    sha512_update (&ctx, s, salt_len);
+  }
+
+  sha512_final (&ctx);
+
+  final[ 0] = h32_from_64_S (ctx.h[0]);
+  final[ 1] = l32_from_64_S (ctx.h[0]);
+  final[ 2] = h32_from_64_S (ctx.h[1]);
+  final[ 3] = l32_from_64_S (ctx.h[1]);
+  final[ 4] = h32_from_64_S (ctx.h[2]);
+  final[ 5] = l32_from_64_S (ctx.h[2]);
+  final[ 6] = h32_from_64_S (ctx.h[3]);
+  final[ 7] = l32_from_64_S (ctx.h[3]);
+  final[ 8] = h32_from_64_S (ctx.h[4]);
+  final[ 9] = l32_from_64_S (ctx.h[4]);
+  final[10] = h32_from_64_S (ctx.h[5]);
+  final[11] = l32_from_64_S (ctx.h[5]);
+  final[12] = h32_from_64_S (ctx.h[6]);
+  final[13] = l32_from_64_S (ctx.h[6]);
+  final[14] = h32_from_64_S (ctx.h[7]);
+  final[15] = l32_from_64_S (ctx.h[7]);
+  // s_final is tiled the same way as p_final, driven by salt_len; NOTE(review): in bounds only while salt_len <= 256 — confirm
+  u32 s_final[64] = { 0 };
+
+	for (pl = salt_len, idx = 0; pl > 64; pl -= 64, idx += 16)
+  {
+    s_final[idx +  0] = final[ 0];
+    s_final[idx +  1] = final[ 1];
+    s_final[idx +  2] = final[ 2];
+    s_final[idx +  3] = final[ 3];
+    s_final[idx +  4] = final[ 4];
+    s_final[idx +  5] = final[ 5];
+    s_final[idx +  6] = final[ 6];
+    s_final[idx +  7] = final[ 7];
+    s_final[idx +  8] = final[ 8];
+    s_final[idx +  9] = final[ 9];
+    s_final[idx + 10] = final[10];
+    s_final[idx + 11] = final[11];
+    s_final[idx + 12] = final[12];
+    s_final[idx + 13] = final[13];
+    s_final[idx + 14] = final[14];
+    s_final[idx + 15] = final[15];
+  }
+  // last tile: final cut to the remaining pl bytes (swap, truncate_block_64, swap back)
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]);
+
+  truncate_block_64 (final, pl);
+
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]);
+
+  s_final[idx +  0] = final[ 0];
+  s_final[idx +  1] = final[ 1];
+  s_final[idx +  2] = final[ 2];
+  s_final[idx +  3] = final[ 3];
+  s_final[idx +  4] = final[ 4];
+  s_final[idx +  5] = final[ 5];
+  s_final[idx +  6] = final[ 6];
+  s_final[idx +  7] = final[ 7];
+  s_final[idx +  8] = final[ 8];
+  s_final[idx +  9] = final[ 9];
+  s_final[idx + 10] = final[10];
+  s_final[idx + 11] = final[11];
+  s_final[idx + 12] = final[12];
+  s_final[idx + 13] = final[13];
+  s_final[idx + 14] = final[14];
+  s_final[idx + 15] = final[15];
+
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 64; i++) tmps[gid].s_bytes[i] = s_final[i];
+}
+
+__kernel void m01800_loop (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha512crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{
+  /**
+   * base: loop kernel; runs loop_cnt rounds starting at round loop_pos on tmps[gid].alt_result
+   */
+
+  const u32 gid = get_global_id (0);
+
+  if (gid >= gid_max) return; // work-items at or past gid_max do nothing
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  const u32 salt_len = salt_bufs[salt_pos].salt_len;
+
+  u32 alt_result[32] = { 0 }; // private copy of the running digest; only [0..15] carry data
+
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) alt_result[i] = tmps[gid].alt_result[i];
+
+  /* Repeatedly run the collected hash value through SHA512 to burn
+     CPU cycles.  */
+
+  for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++) // i counts rounds done in this call, j is the round number that selects the inputs
+  {
+    sha512_ctx_t ctx;
+
+    sha512_init (&ctx);
+    // odd rounds start with p_bytes, even rounds with the current digest
+		if (j & 1)
+    {
+			sha512_update_global (&ctx, tmps[gid].p_bytes, pw_len);
+    }
+		else
+    {
+			sha512_update (&ctx, alt_result, 64);
+    }
+    // s_bytes are added unless j is a multiple of 3
+		if (j % 3)
+    {
+			sha512_update_global (&ctx, tmps[gid].s_bytes, salt_len);
+    }
+    // p_bytes are added unless j is a multiple of 7
+		if (j % 7)
+    {
+			sha512_update_global (&ctx, tmps[gid].p_bytes, pw_len);
+    }
+    // finish with whichever of digest / p_bytes did not start the round
+		if (j & 1)
+    {
+			sha512_update (&ctx, alt_result, 64);
+    }
+		else
+    {
+			sha512_update_global (&ctx, tmps[gid].p_bytes, pw_len);
+    }
+
+    sha512_final (&ctx);
+    // the digest of this round becomes alt_result for the next one
+    alt_result[ 0] = h32_from_64_S (ctx.h[0]);
+    alt_result[ 1] = l32_from_64_S (ctx.h[0]);
+    alt_result[ 2] = h32_from_64_S (ctx.h[1]);
+    alt_result[ 3] = l32_from_64_S (ctx.h[1]);
+    alt_result[ 4] = h32_from_64_S (ctx.h[2]);
+    alt_result[ 5] = l32_from_64_S (ctx.h[2]);
+    alt_result[ 6] = h32_from_64_S (ctx.h[3]);
+    alt_result[ 7] = l32_from_64_S (ctx.h[3]);
+    alt_result[ 8] = h32_from_64_S (ctx.h[4]);
+    alt_result[ 9] = l32_from_64_S (ctx.h[4]);
+    alt_result[10] = h32_from_64_S (ctx.h[5]);
+    alt_result[11] = l32_from_64_S (ctx.h[5]);
+    alt_result[12] = h32_from_64_S (ctx.h[6]);
+    alt_result[13] = l32_from_64_S (ctx.h[6]);
+    alt_result[14] = h32_from_64_S (ctx.h[7]);
+    alt_result[15] = l32_from_64_S (ctx.h[7]);
+  }
+  // write the running digest back to tmps[gid] so a later kernel call can pick it up
+  #ifdef _unroll
+  #pragma unroll
+  #endif
+  for (int i = 0; i < 16; i++) tmps[gid].alt_result[i] = alt_result[i];
+}
+
+__kernel void m01800_comp (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha512crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{
+  /**
+   * base: comp kernel; takes the first four words of tmps[gid].alt_result and hands over to the included compare code
+   */
+
+  const u32 gid = get_global_id (0);
+
+  if (gid >= gid_max) return; // work-items at or past gid_max do nothing
+
+  const u32 lid = get_local_id (0); // not used in the visible code; presumably read by the included COMPARE_M — TODO confirm
+  // r0..r3: byte-swapped alt_result[0..3]; presumably the values the included COMPARE_M matches against — TODO confirm
+  const u32 r0 = swap32 (tmps[gid].alt_result[0]);
+  const u32 r1 = swap32 (tmps[gid].alt_result[1]);
+  const u32 r2 = swap32 (tmps[gid].alt_result[2]);
+  const u32 r3 = swap32 (tmps[gid].alt_result[3]);
+  // il_pos is fixed to 0 for the code pulled in below
+  #define il_pos 0
+
+  #include COMPARE_M
+}
--- a/include/interface.h
+++ b/include/interface.h
@ -518,15 +518,6 @@ typedef struct md5crypt_tmp

 } md5crypt_tmp_t;

-typedef struct sha512crypt_tmp
-{
-  u64  l_alt_result[8];
-
-  u64  l_p_bytes[2];
-  u64  l_s_bytes[2];
-
-} sha512crypt_tmp_t;
-
 typedef struct sha256crypt_tmp
 {
  u32 alt_result[8];
@ -536,6 +527,20 @@ typedef struct sha256crypt_tmp

 } sha256crypt_tmp_t;

+typedef struct sha512crypt_tmp
+{
+  u64 l_alt_result[8];
+  u64 l_p_bytes[2];
+  u64 l_s_bytes[2];
+
+  // pure version: fields read and written by the m01800-pure.cl kernels; layout must match the struct in OpenCL/inc_types.cl
+
+  u32 alt_result[16]; // running digest as 16 words; written by m01800_init and m01800_loop, read by m01800_comp
+  u32 p_bytes[64]; // filled once by m01800_init, read by m01800_loop with length pw_len
+  u32 s_bytes[64]; // filled once by m01800_init, read by m01800_loop with length salt_len
+
+} sha512crypt_tmp_t;
+
 typedef struct wpa_tmp
 {
  u32 ipad[5];