From 1d33b5714474439b2bd81f882ef02ca947b253fb Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Sat, 30 Oct 2021 20:16:45 +0200
Subject: [PATCH] PDF 1.7 Kernel: Improved performance on AMD GPU by using
 shared memory for the scratch buffer Inspired by
 https://github.com/reger-men/hashcat/blob/6.2.4/OpenCL/m10700-optimized.cl

---
 OpenCL/m10700-optimized.cl | 16 ++++++++--------
 docs/changes.txt           |  6 ++++++
 src/modules/module_10700.c |  9 ++++++++-
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/OpenCL/m10700-optimized.cl b/OpenCL/m10700-optimized.cl
index 9779c8fe6..8450f0480 100644
--- a/OpenCL/m10700-optimized.cl
+++ b/OpenCL/m10700-optimized.cl
@@ -211,7 +211,7 @@ DECLSPEC void orig_sha512_transform (const u64 *w0, const u64 *w1, const u64 *w2
 #define WORDMAXSZ4  (WORDMAXSZ  / 4)
 #define AESSZ4      (AESSZ      / 4)
 
-DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, const u32 bl_len)
+DECLSPEC void make_sc (LOCAL_AS u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, const u32 bl_len)
 {
   const u32 bd = bl_len / 4;
 
@@ -263,7 +263,7 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl,
   }
 }
 
-DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, const u32 pwbl_len)
+DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, LOCAL_AS const u32 *sc, const u32 pwbl_len)
 {
   const u32 m = offset % pwbl_len;
 
@@ -293,7 +293,7 @@ DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, con
   #endif
 }
 
-DECLSPEC void make_w_with_offset (ctx_t *ctx, const u32 W_len, const u32 offset, const u32 *sc, const u32 pwbl_len, u32 *iv, const u32 *ks, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
+DECLSPEC void make_w_with_offset (ctx_t *ctx, const u32 W_len, const u32 offset, LOCAL_AS const u32 *sc, const u32 pwbl_len, u32 *iv, const u32 *ks, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
 {
   for (u32 k = 0, wk = 0; k < W_len; k += AESSZ, wk += AESSZ4)
   {
@@ -315,12 +315,10 @@ DECLSPEC void make_w_with_offset (ctx_t *ctx, const u32 W_len, const u32 offset,
   }
 }
 
-DECLSPEC u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
+DECLSPEC u32 do_round (LOCAL_AS u32 *sc, const u32 *pw, const u32 pw_len, ctx_t *ctx, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
 {
   // make scratch buffer
 
-  u32 sc[PWMAXSZ4 + BLMAXSZ4 + AESSZ4];
-
   make_sc (sc, pw, pw_len, ctx->dgst32, ctx->dgst_len);
 
   // make sure pwbl_len is calculcated before it gets changed
@@ -668,18 +666,20 @@ KERNEL_FQ void m10700_loop (KERN_ATTR_TMPS_ESALT (pdf17l8_tmp_t, pdf_t))
   ctx.dgst_len  = tmps[gid].dgst_len;
   ctx.W_len     = tmps[gid].W_len;
 
+  LOCAL_VK u32 s_sc[256][PWMAXSZ4 + BLMAXSZ4 + AESSZ4];
+
   u32 ex = 0;
 
   for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++)
   {
-    ex = do_round (w0, pw_len, &ctx, s_te0, s_te1, s_te2, s_te3, s_te4);
+    ex = do_round (s_sc[lid], w0, pw_len, &ctx, s_te0, s_te1, s_te2, s_te3, s_te4);
   }
 
   if ((loop_pos + loop_cnt) == 64)
   {
     for (u32 i = 64; i < (ex & 0xff) + 32; i++)
     {
-      ex = do_round (w0, pw_len, &ctx, s_te0, s_te1, s_te2, s_te3, s_te4);
+      ex = do_round (s_sc[lid], w0, pw_len, &ctx, s_te0, s_te1, s_te2, s_te3, s_te4);
     }
   }
 
diff --git a/docs/changes.txt b/docs/changes.txt
index 24980e1ff..ef648bef2 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -9,6 +9,12 @@
 - Added hash-mode: MultiBit Classic .wallet (scrypt)
 - Added hash-mode: MurmurHash3
 
+##
+## Performance
+##
+
+- PDF Kernel (10700): Improved performance on AMD GPU by using shared memory for the scratch buffer
+
 ##
 ## Bugs
 ##
diff --git a/src/modules/module_10700.c b/src/modules/module_10700.c
index a8c2a9fbe..06842e5f3 100644
--- a/src/modules/module_10700.c
+++ b/src/modules/module_10700.c
@@ -108,6 +108,13 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_max = 256;
+
+  return kernel_threads_max;
+}
+
 char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
   char *jit_build_options = NULL;
@@ -360,7 +367,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
-  module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max       = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
   module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;