From 47509b2954e99f838c3ce20da86b47de49088ad1 Mon Sep 17 00:00:00 2001
From: fse-a <p.barens@nfi.nl>
Date: Thu, 15 Feb 2024 15:17:29 +0100
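Subject: [PATCH] Improve performance of scrypt-based algorithms by code
 reordering.

In salsa_r() the block shuffle that used to run after the Salsa rounds
now runs before them (SCRYPT_R > 1 only). To go with the changed
ordering, scrypt_smix_init() interleaves the two halves of X once
before filling V, and the *_comp kernels read tmps[gid].P per state in
a permuted order (every other 4-element group first, then the
remaining groups) instead of walking it linearly.

The same change is applied to m08900, m15700, m22700, m27700 and
m28200.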
---
 OpenCL/m08900-pure.cl | 91 +++++++++++++++++++++++++++---------------
 OpenCL/m15700-pure.cl | 92 ++++++++++++++++++++++++++++---------------
 OpenCL/m22700-pure.cl | 91 +++++++++++++++++++++++++++---------------
 OpenCL/m27700-pure.cl | 91 +++++++++++++++++++++++++++---------------
 OpenCL/m28200-pure.cl | 92 ++++++++++++++++++++++++++++---------------
 5 files changed, 302 insertions(+), 155 deletions(-)

diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl
index 3064263a2..b74b3d9c6 100644
--- a/OpenCL/m08900-pure.cl
+++ b/OpenCL/m08900-pure.cl
@@ -128,6 +128,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
 
 DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
 {
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4 / 2];
+
+  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TT[dst_off + 0] = TI[src_off + 0];
+    TT[dst_off + 1] = TI[src_off + 1];
+    TT[dst_off + 2] = TI[src_off + 2];
+    TT[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TI[dst_off + 0] = TI[src_off + 0];
+    TI[dst_off + 1] = TI[src_off + 1];
+    TI[dst_off + 2] = TI[src_off + 2];
+    TI[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+  {
+    TI[dst_off + 0] = TT[src_off + 0];
+    TI[dst_off + 1] = TT[src_off + 1];
+    TI[dst_off + 2] = TT[src_off + 2];
+    TI[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   uint4 R0 = TI[STATE_CNT4 - 4];
   uint4 R1 = TI[STATE_CNT4 - 3];
   uint4 R2 = TI[STATE_CNT4 - 2];
@@ -165,36 +195,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
     TI[i + 2] = R2;
     TI[i + 3] = R3;
   }
-
-  #if SCRYPT_R > 1
-
-  uint4 TT[STATE_CNT4 / 2];
-
-  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TT[dst_off + 0] = TI[src_off + 0];
-    TT[dst_off + 1] = TI[src_off + 1];
-    TT[dst_off + 2] = TI[src_off + 2];
-    TT[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TI[dst_off + 0] = TI[src_off + 0];
-    TI[dst_off + 1] = TI[src_off + 1];
-    TI[dst_off + 2] = TI[src_off + 2];
-    TI[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
-  {
-    TI[dst_off + 0] = TT[src_off + 0];
-    TI[dst_off + 1] = TT[src_off + 1];
-    TI[dst_off + 2] = TT[src_off + 2];
-    TI[dst_off + 3] = TT[src_off + 3];
-  }
-
-  #endif
 }
 
 DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -217,6 +217,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
     case 3: V = V3; break;
   }
 
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4];
+
+  for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+  for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -459,10 +483,14 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
 
   sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
 
-  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+  for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
   {
+   for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+   {
     uint4 X[4];
 
+    const u32 l =  i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
     X[0] = tmps[gid].P[l + 0];
     X[1] = tmps[gid].P[l + 1];
     X[2] = tmps[gid].P[l + 2];
@@ -510,6 +538,7 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
     w3[3] = T[3].w;
 
     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+   }
   }
 
   w0[0] = 1;
diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl
index e500b4f70..d435883ce 100644
--- a/OpenCL/m15700-pure.cl
+++ b/OpenCL/m15700-pure.cl
@@ -135,6 +135,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
 
 DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
 {
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4 / 2];
+
+  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TT[dst_off + 0] = TI[src_off + 0];
+    TT[dst_off + 1] = TI[src_off + 1];
+    TT[dst_off + 2] = TI[src_off + 2];
+    TT[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TI[dst_off + 0] = TI[src_off + 0];
+    TI[dst_off + 1] = TI[src_off + 1];
+    TI[dst_off + 2] = TI[src_off + 2];
+    TI[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+  {
+    TI[dst_off + 0] = TT[src_off + 0];
+    TI[dst_off + 1] = TT[src_off + 1];
+    TI[dst_off + 2] = TT[src_off + 2];
+    TI[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   uint4 R0 = TI[STATE_CNT4 - 4];
   uint4 R1 = TI[STATE_CNT4 - 3];
   uint4 R2 = TI[STATE_CNT4 - 2];
@@ -172,36 +202,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
     TI[i + 2] = R2;
     TI[i + 3] = R3;
   }
-
-  #if SCRYPT_R > 1
-
-  uint4 TT[STATE_CNT4 / 2];
-
-  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TT[dst_off + 0] = TI[src_off + 0];
-    TT[dst_off + 1] = TI[src_off + 1];
-    TT[dst_off + 2] = TI[src_off + 2];
-    TT[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TI[dst_off + 0] = TI[src_off + 0];
-    TI[dst_off + 1] = TI[src_off + 1];
-    TI[dst_off + 2] = TI[src_off + 2];
-    TI[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
-  {
-    TI[dst_off + 0] = TT[src_off + 0];
-    TI[dst_off + 1] = TT[src_off + 1];
-    TI[dst_off + 2] = TT[src_off + 2];
-    TI[dst_off + 3] = TT[src_off + 3];
-  }
-
-  #endif
 }
 
 DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -224,6 +224,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
     case 3: V = V3; break;
   }
 
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4];
+
+  for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+  for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -595,10 +619,15 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
 
   sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
 
-  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+
+  for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
   {
+   for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+   {
     uint4 X[4];
 
+    const u32 l =  i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
     X[0] = tmps[gid].P[l + 0];
     X[1] = tmps[gid].P[l + 1];
     X[2] = tmps[gid].P[l + 2];
@@ -646,6 +675,7 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
     w3[3] = T[3].w;
 
     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+   }
   }
 
   w0[0] = 1;
diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl
index a29df1c03..303e5e334 100644
--- a/OpenCL/m22700-pure.cl
+++ b/OpenCL/m22700-pure.cl
@@ -176,6 +176,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
 
 DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
 {
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4 / 2];
+
+  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TT[dst_off + 0] = TI[src_off + 0];
+    TT[dst_off + 1] = TI[src_off + 1];
+    TT[dst_off + 2] = TI[src_off + 2];
+    TT[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TI[dst_off + 0] = TI[src_off + 0];
+    TI[dst_off + 1] = TI[src_off + 1];
+    TI[dst_off + 2] = TI[src_off + 2];
+    TI[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+  {
+    TI[dst_off + 0] = TT[src_off + 0];
+    TI[dst_off + 1] = TT[src_off + 1];
+    TI[dst_off + 2] = TT[src_off + 2];
+    TI[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   uint4 R0 = TI[STATE_CNT4 - 4];
   uint4 R1 = TI[STATE_CNT4 - 3];
   uint4 R2 = TI[STATE_CNT4 - 2];
@@ -213,36 +243,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
     TI[i + 2] = R2;
     TI[i + 3] = R3;
   }
-
-  #if SCRYPT_R > 1
-
-  uint4 TT[STATE_CNT4 / 2];
-
-  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TT[dst_off + 0] = TI[src_off + 0];
-    TT[dst_off + 1] = TI[src_off + 1];
-    TT[dst_off + 2] = TI[src_off + 2];
-    TT[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TI[dst_off + 0] = TI[src_off + 0];
-    TI[dst_off + 1] = TI[src_off + 1];
-    TI[dst_off + 2] = TI[src_off + 2];
-    TI[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
-  {
-    TI[dst_off + 0] = TT[src_off + 0];
-    TI[dst_off + 1] = TT[src_off + 1];
-    TI[dst_off + 2] = TT[src_off + 2];
-    TI[dst_off + 3] = TT[src_off + 3];
-  }
-
-  #endif
 }
 
 DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -265,6 +265,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
     case 3: V = V3; break;
   }
 
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4];
+
+  for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+  for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -597,10 +621,14 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
   u32 w2[4];
   u32 w3[4];
 
-  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+  for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
   {
+   for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+   {
     uint4 X[4];
 
+    const u32 l =  i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
     X[0] = tmps[gid].P[l + 0];
     X[1] = tmps[gid].P[l + 1];
     X[2] = tmps[gid].P[l + 2];
@@ -648,6 +676,7 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
     w3[3] = T[3].w;
 
     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+   }
   }
 
   w0[0] = 1;
diff --git a/OpenCL/m27700-pure.cl b/OpenCL/m27700-pure.cl
index c62dc90d6..d9bf11510 100644
--- a/OpenCL/m27700-pure.cl
+++ b/OpenCL/m27700-pure.cl
@@ -126,6 +126,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
 
 DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
 {
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4 / 2];
+
+  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TT[dst_off + 0] = TI[src_off + 0];
+    TT[dst_off + 1] = TI[src_off + 1];
+    TT[dst_off + 2] = TI[src_off + 2];
+    TT[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TI[dst_off + 0] = TI[src_off + 0];
+    TI[dst_off + 1] = TI[src_off + 1];
+    TI[dst_off + 2] = TI[src_off + 2];
+    TI[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+  {
+    TI[dst_off + 0] = TT[src_off + 0];
+    TI[dst_off + 1] = TT[src_off + 1];
+    TI[dst_off + 2] = TT[src_off + 2];
+    TI[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   uint4 R0 = TI[STATE_CNT4 - 4];
   uint4 R1 = TI[STATE_CNT4 - 3];
   uint4 R2 = TI[STATE_CNT4 - 2];
@@ -163,36 +193,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
     TI[i + 2] = R2;
     TI[i + 3] = R3;
   }
-
-  #if SCRYPT_R > 1
-
-  uint4 TT[STATE_CNT4 / 2];
-
-  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TT[dst_off + 0] = TI[src_off + 0];
-    TT[dst_off + 1] = TI[src_off + 1];
-    TT[dst_off + 2] = TI[src_off + 2];
-    TT[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TI[dst_off + 0] = TI[src_off + 0];
-    TI[dst_off + 1] = TI[src_off + 1];
-    TI[dst_off + 2] = TI[src_off + 2];
-    TI[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
-  {
-    TI[dst_off + 0] = TT[src_off + 0];
-    TI[dst_off + 1] = TT[src_off + 1];
-    TI[dst_off + 2] = TT[src_off + 2];
-    TI[dst_off + 3] = TT[src_off + 3];
-  }
-
-  #endif
 }
 
 DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -215,6 +215,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
     case 3: V = V3; break;
   }
 
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4];
+
+  for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+  for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -549,10 +573,14 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
   u32 w2[4];
   u32 w3[4];
 
-  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+  for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
   {
+   for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+   {
     uint4 X[4];
 
+    const u32 l =  i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
     X[0] = tmps[gid].P[l + 0];
     X[1] = tmps[gid].P[l + 1];
     X[2] = tmps[gid].P[l + 2];
@@ -600,6 +628,7 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
     w3[3] = T[3].w;
 
     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+   }
   }
 
   w0[0] = 1;
diff --git a/OpenCL/m28200-pure.cl b/OpenCL/m28200-pure.cl
index 2260e931b..58106a007 100644
--- a/OpenCL/m28200-pure.cl
+++ b/OpenCL/m28200-pure.cl
@@ -138,6 +138,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
 
 DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
 {
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4 / 2];
+
+  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TT[dst_off + 0] = TI[src_off + 0];
+    TT[dst_off + 1] = TI[src_off + 1];
+    TT[dst_off + 2] = TI[src_off + 2];
+    TT[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+  {
+    TI[dst_off + 0] = TI[src_off + 0];
+    TI[dst_off + 1] = TI[src_off + 1];
+    TI[dst_off + 2] = TI[src_off + 2];
+    TI[dst_off + 3] = TI[src_off + 3];
+  }
+
+  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+  {
+    TI[dst_off + 0] = TT[src_off + 0];
+    TI[dst_off + 1] = TT[src_off + 1];
+    TI[dst_off + 2] = TT[src_off + 2];
+    TI[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   uint4 R0 = TI[STATE_CNT4 - 4];
   uint4 R1 = TI[STATE_CNT4 - 3];
   uint4 R2 = TI[STATE_CNT4 - 2];
@@ -175,38 +205,9 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
     TI[i + 2] = R2;
     TI[i + 3] = R3;
   }
-
-  #if SCRYPT_R > 1
-
-  uint4 TT[STATE_CNT4 / 2];
-
-  for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TT[dst_off + 0] = TI[src_off + 0];
-    TT[dst_off + 1] = TI[src_off + 1];
-    TT[dst_off + 2] = TI[src_off + 2];
-    TT[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
-  {
-    TI[dst_off + 0] = TI[src_off + 0];
-    TI[dst_off + 1] = TI[src_off + 1];
-    TI[dst_off + 2] = TI[src_off + 2];
-    TI[dst_off + 3] = TI[src_off + 3];
-  }
-
-  for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
-  {
-    TI[dst_off + 0] = TT[src_off + 0];
-    TI[dst_off + 1] = TT[src_off + 1];
-    TI[dst_off + 2] = TT[src_off + 2];
-    TI[dst_off + 3] = TT[src_off + 3];
-  }
-
-  #endif
 }
 
+
 DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
 {
   const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
@@ -227,6 +228,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
     case 3: V = V3; break;
   }
 
+  #if SCRYPT_R > 1
+
+  uint4 TT[STATE_CNT4];
+
+  for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+  for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+  {
+    X[dst_off + 0] = TT[src_off + 0];
+    X[dst_off + 1] = TT[src_off + 1];
+    X[dst_off + 2] = TT[src_off + 2];
+    X[dst_off + 3] = TT[src_off + 3];
+  }
+
+  #endif
+
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -517,10 +542,14 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
 
   sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
 
-  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+  for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
   {
+   for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+   {
     uint4 X[4];
 
+    const u32 l =  i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
     X[0] = tmps[gid].P[l + 0];
     X[1] = tmps[gid].P[l + 1];
     X[2] = tmps[gid].P[l + 2];
@@ -568,6 +597,7 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
     w3[3] = T[3].w;
 
     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+   }
   }
 
   w0[0] = 1;