From 349b3c46736726b367c10eafe8eb004edbab98a4 Mon Sep 17 00:00:00 2001
From: Jens Steube
Date: Thu, 2 Jan 2020 11:59:37 +0100
Subject: [PATCH] Fix Bitlocker in OpenCL mode on NV

---
 OpenCL/m22100-pure.cl | 156 +++++++++++++++++++++++++++++-------------
 1 file changed, 110 insertions(+), 46 deletions(-)

diff --git a/OpenCL/m22100-pure.cl b/OpenCL/m22100-pure.cl
index 270cc52ef..bfb4b7908 100644
--- a/OpenCL/m22100-pure.cl
+++ b/OpenCL/m22100-pure.cl
@@ -199,32 +199,29 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
   const u64 lid = get_local_id (0);
   const u64 lsz = get_local_size (0);
 
-  /**
-   * load 256 full w[] precomputed KE buffers into shared memory since its all static data
-   * in order for this to work we need to set a fixed loop count to 256
-   */
+  // init
 
-  #ifdef REAL_SHM
+  u32x w0[4];
+  u32x w1[4];
+  u32x w2[4];
+  u32x w3[4];
 
-  LOCAL_VK u32 s_wb_ke_pc[256][48];
-
-  for (u32 i = lid; i < 256; i += lsz)
-  {
-    for (u32 j = 0; j < 48; j++) // first 16 set to register
-    {
-      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + i][j];
-    }
-  }
-
-  SYNC_THREADS ();
-
-  #else
-
-  GLOBAL_AS u32 (*s_wb_ke_pc)[48] = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos];
-
-  #endif
-
-  if ((gid * VECT_SIZE) >= gid_max) return;
+  w0[0] = packv (tmps, last_hash, gid, 0); // last_hash
+  w0[1] = packv (tmps, last_hash, gid, 1);
+  w0[2] = packv (tmps, last_hash, gid, 2);
+  w0[3] = packv (tmps, last_hash, gid, 3);
+  w1[0] = packv (tmps, last_hash, gid, 4);
+  w1[1] = packv (tmps, last_hash, gid, 5);
+  w1[2] = packv (tmps, last_hash, gid, 6);
+  w1[3] = packv (tmps, last_hash, gid, 7);
+  w2[0] = packv (tmps, init_hash, gid, 0); // init_hash
+  w2[1] = packv (tmps, init_hash, gid, 1);
+  w2[2] = packv (tmps, init_hash, gid, 2);
+  w2[3] = packv (tmps, init_hash, gid, 3);
+  w3[0] = packv (tmps, init_hash, gid, 4);
+  w3[1] = packv (tmps, init_hash, gid, 5);
+  w3[2] = packv (tmps, init_hash, gid, 6);
+  w3[3] = packv (tmps, init_hash, gid, 7);
 
   // salt to register
 
@@ -250,33 +247,46 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
   t3[2] = 0;
   t3[3] = 88 * 8;
 
-  // init
+  /**
+   * load FIXED_ITER_COUNT full w[] precomputed KE buffers into shared memory since it's all static data
+   * in order for this to work we need to set a fixed loop count to FIXED_ITER_COUNT
+   * We also need to handle OpenCL and CUDA differently because of:
+   * ptxas error : Entry function 'm22100_loop' uses too much shared data (0xc004 bytes, 0xc000 max)
+   */
 
-  u32x w0[4];
-  u32x w1[4];
-  u32x w2[4];
-  u32x w3[4];
+  #ifdef IS_CUDA
+  #define FIXED_ITER_COUNT 256
+  #else
+  #define FIXED_ITER_COUNT 128
+  #endif
 
-  w0[0] = packv (tmps, last_hash, gid, 0); // last_hash
-  w0[1] = packv (tmps, last_hash, gid, 1);
-  w0[2] = packv (tmps, last_hash, gid, 2);
-  w0[3] = packv (tmps, last_hash, gid, 3);
-  w1[0] = packv (tmps, last_hash, gid, 4);
-  w1[1] = packv (tmps, last_hash, gid, 5);
-  w1[2] = packv (tmps, last_hash, gid, 6);
-  w1[3] = packv (tmps, last_hash, gid, 7);
-  w2[0] = packv (tmps, init_hash, gid, 0); // init_hash
-  w2[1] = packv (tmps, init_hash, gid, 1);
-  w2[2] = packv (tmps, init_hash, gid, 2);
-  w2[3] = packv (tmps, init_hash, gid, 3);
-  w3[0] = packv (tmps, init_hash, gid, 4);
-  w3[1] = packv (tmps, init_hash, gid, 5);
-  w3[2] = packv (tmps, init_hash, gid, 6);
-  w3[3] = packv (tmps, init_hash, gid, 7);
+  #ifdef REAL_SHM
+  LOCAL_VK u32 s_wb_ke_pc[FIXED_ITER_COUNT][48];
+  #else
+  GLOBAL_AS u32 (*s_wb_ke_pc)[48] = NULL;
+  #endif
+
+  #ifdef REAL_SHM
+
+  for (u32 i = lid; i < FIXED_ITER_COUNT; i += lsz)
+  {
+    for (u32 j = 0; j < 48; j++) // first 16 set to register
+    {
+      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + i][j];
+    }
+  }
+
+  SYNC_THREADS ();
+
+  #else
+
+  s_wb_ke_pc = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos];
+
+  #endif
 
   // main loop
 
-  for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++)
+  for (u32 i = 0, j = loop_pos; i < FIXED_ITER_COUNT; i++, j++)
   {
     u32x digest[8];
 
@@ -305,6 +315,60 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
     w1[3] = digest[7];
   }
 
+  #ifdef IS_CUDA
+  // nothing to do
+  #else
+  // remaining 128 iterations for non-CUDA devices
+  #ifdef REAL_SHM
+
+  for (u32 i = lid; i < FIXED_ITER_COUNT; i += lsz)
+  {
+    for (u32 j = 0; j < 48; j++) // first 16 set to register
+    {
+      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + 128 + i][j];
+    }
+  }
+
+  SYNC_THREADS ();
+
+  #else
+
+  s_wb_ke_pc = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos + 128];
+
+  #endif
+
+  // main loop
+
+  for (u32 i = 0, j = loop_pos + 128; i < FIXED_ITER_COUNT; i++, j++)
+  {
+    u32x digest[8];
+
+    digest[0] = SHA256M_A;
+    digest[1] = SHA256M_B;
+    digest[2] = SHA256M_C;
+    digest[3] = SHA256M_D;
+    digest[4] = SHA256M_E;
+    digest[5] = SHA256M_F;
+    digest[6] = SHA256M_G;
+    digest[7] = SHA256M_H;
+
+    sha256_transform_vector (w0, w1, w2, w3, digest);
+
+    t1[0] = hc_swap32_S (j); // only moving part
+
+    sha256_transform_vector_pc (t0, t1, t2, t3, digest, s_wb_ke_pc[i]);
+
+    w0[0] = digest[0];
+    w0[1] = digest[1];
+    w0[2] = digest[2];
+    w0[3] = digest[3];
+    w1[0] = digest[4];
+    w1[1] = digest[5];
+    w1[2] = digest[6];
+    w1[3] = digest[7];
+  }
+  #endif
+
   unpackv (tmps, last_hash, gid, 0, w0[0]);
   unpackv (tmps, last_hash, gid, 1, w0[1]);
   unpackv (tmps, last_hash, gid, 2, w0[2]);
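Note for reviewers (not part of the patch): a quick sanity check of the shared-memory sizing behind the CUDA/OpenCL split above. The only inputs are taken from the kernel itself (48 u32 words per precomputed wb_ke_pc[] entry, 256 vs. 128 resident entries) and from the ptxas message quoted in the new comment (0xc004 bytes requested against a 0xc000-byte limit). The small C program below is just an illustration of that arithmetic, not part of hashcat.

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  const size_t entry_words = 48;                              // u32 words per wb_ke_pc[] entry
  const size_t entry_bytes = entry_words * sizeof (uint32_t); // 192 bytes per entry

  const size_t full_table  = 256 * entry_bytes;               // CUDA path: whole table resident
  const size_t half_table  = 128 * entry_bytes;               // OpenCL path: FIXED_ITER_COUNT = 128

  const size_t ptxas_limit = 0xc000;                          // limit quoted in the ptxas error

  printf ("256 entries: %zu bytes (0x%zx)\n", full_table, full_table); // 49152 (0xc000)
  printf ("128 entries: %zu bytes (0x%zx)\n", half_table, half_table); // 24576 (0x6000)
  printf ("ptxas limit: %zu bytes (0x%zx)\n", ptxas_limit, ptxas_limit);

  // The 256-entry table alone already fills the 0xc000-byte budget, so the extra
  // 4 bytes of shared data reported by ptxas (0xc004) push the NV OpenCL build
  // over the limit. Halving the resident table to 128 entries leaves ample room,
  // at the cost of loading the precomputed buffers and running the SHA-256 loop
  // in two batches of 128 iterations, which is what the last hunk adds.
  return 0;
}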