From 8c5c225d8f009148decdf7097365e44d9928fe0a Mon Sep 17 00:00:00 2001
From: unix-ninja <chris@unix-ninja.com>
Date: Mon, 22 Oct 2018 13:27:35 -0400
Subject: [PATCH] Optimize performance on NVIDIA GTX

---
 OpenCL/m18100_a0-pure.cl | 70 +++++++++++++++++-----------------
 OpenCL/m18100_a1-pure.cl | 70 +++++++++++++++++-----------------
 OpenCL/m18100_a3-pure.cl | 82 +++++++++++++++++++---------------------
 3 files changed, 110 insertions(+), 112 deletions(-)

diff --git a/OpenCL/m18100_a0-pure.cl b/OpenCL/m18100_a0-pure.cl
index bb0894e19..9c589b1b3 100644
--- a/OpenCL/m18100_a0-pure.cl
+++ b/OpenCL/m18100_a0-pure.cl
@@ -59,29 +59,30 @@ __kernel void m18100_mxx (__global pw_t *pws, __constant const kernel_rule_t *ru
 
     sha1_hmac_final (&ctx);
 
-    // calculate the offset using the least 4 bits of the last byte of our hash
-    const u32x otp_offset = ctx.opad.h[4] & 0xf;
-
     // initialize a buffer for the otp code
     u32 otp_code = 0;
 
     // grab 4 consecutive bytes of the hash, starting at offset
-    // on some systems, &3 is faster than %4, so we will use it in our switch()
-    switch (otp_offset & 3)
+    switch (ctx.opad.h[4] & 15)
     {
-    case 1:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) <<  8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24);
-      break;
-    case 2:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16);
-      break;
-    case 3:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >>  8);
-      break;
-    default:
-      otp_code = ctx.opad.h[otp_offset/4];
-      break;
+      case  0: otp_code = ctx.opad.h[0];                              break;
+      case  1: otp_code = ctx.opad.h[0] <<  8 | ctx.opad.h[1] >> 24;  break;
+      case  2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16;  break;
+      case  3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >>  8;  break;
+      case  4: otp_code = ctx.opad.h[1];                              break;
+      case  5: otp_code = ctx.opad.h[1] <<  8 | ctx.opad.h[2] >> 24;  break;
+      case  6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16;  break;
+      case  7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >>  8;  break;
+      case  8: otp_code = ctx.opad.h[2];                              break;
+      case  9: otp_code = ctx.opad.h[2] <<  8 | ctx.opad.h[3] >> 24;  break;
+      case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16;  break;
+      case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >>  8;  break;
+      case 12: otp_code = ctx.opad.h[3];                              break;
+      case 13: otp_code = ctx.opad.h[3] <<  8 | ctx.opad.h[4] >> 24;  break;
+      case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16;  break;
+      case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >>  8;  break;
     }
+
     // take only the lower 31 bits
     otp_code &= 0x7fffffff;
     // we want to generate only 6 digits of code
@@ -152,29 +153,30 @@ __kernel void m18100_sxx (__global pw_t *pws, __constant const kernel_rule_t *ru
 
     sha1_hmac_final (&ctx);
 
-    // calculate the offset using the least 4 bits of the last byte of our hash
-    const u32x otp_offset = ctx.opad.h[4] & 0xf;
-
     // initialize a buffer for the otp code
     u32 otp_code = 0;
 
     // grab 4 consecutive bytes of the hash, starting at offset
-    // on some systems, &3 is faster than %4, so we will use it in our switch()
-    switch (otp_offset & 3)
+    switch (ctx.opad.h[4] & 15)
     {
-    case 1:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) <<  8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24);
-      break;
-    case 2:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16);
-      break;
-    case 3:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >>  8);
-      break;
-    default:
-      otp_code = ctx.opad.h[otp_offset/4];
-      break;
+      case  0: otp_code = ctx.opad.h[0];                              break;
+      case  1: otp_code = ctx.opad.h[0] <<  8 | ctx.opad.h[1] >> 24;  break;
+      case  2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16;  break;
+      case  3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >>  8;  break;
+      case  4: otp_code = ctx.opad.h[1];                              break;
+      case  5: otp_code = ctx.opad.h[1] <<  8 | ctx.opad.h[2] >> 24;  break;
+      case  6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16;  break;
+      case  7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >>  8;  break;
+      case  8: otp_code = ctx.opad.h[2];                              break;
+      case  9: otp_code = ctx.opad.h[2] <<  8 | ctx.opad.h[3] >> 24;  break;
+      case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16;  break;
+      case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >>  8;  break;
+      case 12: otp_code = ctx.opad.h[3];                              break;
+      case 13: otp_code = ctx.opad.h[3] <<  8 | ctx.opad.h[4] >> 24;  break;
+      case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16;  break;
+      case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >>  8;  break;
     }
+
     // take only the lower 31 bits
     otp_code &= 0x7fffffff;
     // we want to generate only 6 digits of code
diff --git a/OpenCL/m18100_a1-pure.cl b/OpenCL/m18100_a1-pure.cl
index 815eec778..940a7c5da 100644
--- a/OpenCL/m18100_a1-pure.cl
+++ b/OpenCL/m18100_a1-pure.cl
@@ -82,29 +82,30 @@ __kernel void m18100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule
 
     sha1_hmac_final (&ctx);
 
-    // calculate the offset using the least 4 bits of the last byte of our hash
-    const u32x otp_offset = ctx.opad.h[4] & 0xf;
-
     // initialize a buffer for the otp code
     u32 otp_code = 0;
 
     // grab 4 consecutive bytes of the hash, starting at offset
-    // on some systems, &3 is faster than %4, so we will use it in our switch()
-    switch (otp_offset & 3)
+    switch (ctx.opad.h[4] & 15)
     {
-    case 1:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) <<  8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24);
-      break;
-    case 2:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16);
-      break;
-    case 3:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >>  8);
-      break;
-    default:
-      otp_code = ctx.opad.h[otp_offset/4];
-      break;
+      case  0: otp_code = ctx.opad.h[0];                              break;
+      case  1: otp_code = ctx.opad.h[0] <<  8 | ctx.opad.h[1] >> 24;  break;
+      case  2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16;  break;
+      case  3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >>  8;  break;
+      case  4: otp_code = ctx.opad.h[1];                              break;
+      case  5: otp_code = ctx.opad.h[1] <<  8 | ctx.opad.h[2] >> 24;  break;
+      case  6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16;  break;
+      case  7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >>  8;  break;
+      case  8: otp_code = ctx.opad.h[2];                              break;
+      case  9: otp_code = ctx.opad.h[2] <<  8 | ctx.opad.h[3] >> 24;  break;
+      case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16;  break;
+      case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >>  8;  break;
+      case 12: otp_code = ctx.opad.h[3];                              break;
+      case 13: otp_code = ctx.opad.h[3] <<  8 | ctx.opad.h[4] >> 24;  break;
+      case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16;  break;
+      case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >>  8;  break;
     }
+
     // take only the lower 31 bits
     otp_code &= 0x7fffffff;
     // we want to generate only 6 digits of code
@@ -200,29 +201,30 @@ __kernel void m18100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule
 
     sha1_hmac_final (&ctx);
 
-    // calculate the offset using the least 4 bits of the last byte of our hash
-    const u32x otp_offset = ctx.opad.h[4] & 0xf;
-
     // initialize a buffer for the otp code
     u32 otp_code = 0;
 
     // grab 4 consecutive bytes of the hash, starting at offset
-    // on some systems, &3 is faster than %4, so we will use it in our switch()
-    switch (otp_offset & 3)
+    switch (ctx.opad.h[4] & 15)
     {
-    case 1:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) <<  8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24);
-      break;
-    case 2:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16);
-      break;
-    case 3:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >>  8);
-      break;
-    default:
-      otp_code = ctx.opad.h[otp_offset/4];
-      break;
+      case  0: otp_code = ctx.opad.h[0];                              break;
+      case  1: otp_code = ctx.opad.h[0] <<  8 | ctx.opad.h[1] >> 24;  break;
+      case  2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16;  break;
+      case  3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >>  8;  break;
+      case  4: otp_code = ctx.opad.h[1];                              break;
+      case  5: otp_code = ctx.opad.h[1] <<  8 | ctx.opad.h[2] >> 24;  break;
+      case  6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16;  break;
+      case  7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >>  8;  break;
+      case  8: otp_code = ctx.opad.h[2];                              break;
+      case  9: otp_code = ctx.opad.h[2] <<  8 | ctx.opad.h[3] >> 24;  break;
+      case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16;  break;
+      case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >>  8;  break;
+      case 12: otp_code = ctx.opad.h[3];                              break;
+      case 13: otp_code = ctx.opad.h[3] <<  8 | ctx.opad.h[4] >> 24;  break;
+      case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16;  break;
+      case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >>  8;  break;
     }
+
     // take only the lower 31 bits
     otp_code &= 0x7fffffff;
     // we want to generate only 6 digits of code
diff --git a/OpenCL/m18100_a3-pure.cl b/OpenCL/m18100_a3-pure.cl
index a71516aa8..d622be763 100644
--- a/OpenCL/m18100_a3-pure.cl
+++ b/OpenCL/m18100_a3-pure.cl
@@ -68,39 +68,36 @@ __kernel void m18100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule
 
     sha1_hmac_final_vector (&ctx);
 
-    // calculate the offset using the least 4 bits of the last byte of our hash
-    const u32x otp_offset = ctx.opad.h[4] & 0xf;
-
     // initialize a buffer for the otp code
     u32 otp_code = 0;
 
     // grab 4 consecutive bytes of the hash, starting at offset
-    // on some systems, &3 is faster than %4, so we will use it in our switch()
-    switch (otp_offset & 3)
+    switch (ctx.opad.h[4] & 15)
     {
-    case 1:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) <<  8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24);
-      break;
-    case 2:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16);
-      break;
-    case 3:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >>  8);
-      break;
-    default:
-      otp_code = ctx.opad.h[otp_offset/4];
-      break;
+      case  0: otp_code = ctx.opad.h[0];                              break;
+      case  1: otp_code = ctx.opad.h[0] <<  8 | ctx.opad.h[1] >> 24;  break;
+      case  2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16;  break;
+      case  3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >>  8;  break;
+      case  4: otp_code = ctx.opad.h[1];                              break;
+      case  5: otp_code = ctx.opad.h[1] <<  8 | ctx.opad.h[2] >> 24;  break;
+      case  6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16;  break;
+      case  7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >>  8;  break;
+      case  8: otp_code = ctx.opad.h[2];                              break;
+      case  9: otp_code = ctx.opad.h[2] <<  8 | ctx.opad.h[3] >> 24;  break;
+      case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16;  break;
+      case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >>  8;  break;
+      case 12: otp_code = ctx.opad.h[3];                              break;
+      case 13: otp_code = ctx.opad.h[3] <<  8 | ctx.opad.h[4] >> 24;  break;
+      case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16;  break;
+      case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >>  8;  break;
     }
+
     // take only the lower 31 bits
     otp_code &= 0x7fffffff;
+
     // we want to generate only 6 digits of code
     otp_code %= 1000000;
 
-    const u32x r0 = ctx.opad.h[DGST_R0];
-    const u32x r1 = ctx.opad.h[DGST_R1];
-    const u32x r2 = ctx.opad.h[DGST_R2];
-    const u32x r3 = ctx.opad.h[DGST_R3];
-
     COMPARE_M_SIMD (otp_code, 0, 0, 0);
   }
 }
@@ -172,39 +169,36 @@ __kernel void m18100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule
 
     sha1_hmac_final_vector (&ctx);
 
-    // calculate the offset using the least 4 bits of the last byte of our hash
-    const u32x otp_offset = ctx.opad.h[4] & 0xf;
-
     // initialize a buffer for the otp code
     u32 otp_code = 0;
 
     // grab 4 consecutive bytes of the hash, starting at offset
-    // on some systems, &3 is faster than %4, so we will use it in our switch()
-    switch (otp_offset & 3)
+    switch (ctx.opad.h[4] & 15)
     {
-    case 1:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) <<  8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24);
-      break;
-    case 2:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16);
-      break;
-    case 3:
-      otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >>  8);
-      break;
-    default:
-      otp_code = ctx.opad.h[otp_offset/4];
-      break;
+      case  0: otp_code = ctx.opad.h[0];                              break;
+      case  1: otp_code = ctx.opad.h[0] <<  8 | ctx.opad.h[1] >> 24;  break;
+      case  2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16;  break;
+      case  3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >>  8;  break;
+      case  4: otp_code = ctx.opad.h[1];                              break;
+      case  5: otp_code = ctx.opad.h[1] <<  8 | ctx.opad.h[2] >> 24;  break;
+      case  6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16;  break;
+      case  7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >>  8;  break;
+      case  8: otp_code = ctx.opad.h[2];                              break;
+      case  9: otp_code = ctx.opad.h[2] <<  8 | ctx.opad.h[3] >> 24;  break;
+      case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16;  break;
+      case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >>  8;  break;
+      case 12: otp_code = ctx.opad.h[3];                              break;
+      case 13: otp_code = ctx.opad.h[3] <<  8 | ctx.opad.h[4] >> 24;  break;
+      case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16;  break;
+      case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >>  8;  break;
     }
+
     // take only the lower 31 bits
     otp_code &= 0x7fffffff;
+
     // we want to generate only 6 digits of code
     otp_code %= 1000000;
 
-    const u32x r0 = ctx.opad.h[DGST_R0];
-    const u32x r1 = ctx.opad.h[DGST_R1];
-    const u32x r2 = ctx.opad.h[DGST_R2];
-    const u32x r3 = ctx.opad.h[DGST_R3];
-
     COMPARE_S_SIMD (otp_code, 0, 0, 0);
   }
 }