PDF Kernel (10700): Improved performance on AMD GPU by using shared memory for the scratch buffer and disabling inlining to reduce register spilling

Inspired by https://github.com/reger-men/hashcat/blob/6.2.4/OpenCL/m10700-optimized.cl
2025-06-25 01:18:57 +00:00 · 2021-10-31 10:05:58 +01:00 · 2021-10-31 10:05:58 +01:00 · aee8e559c4
commit aee8e559c4
parent 1d33b57144
3 changed files with 13 additions and 2 deletions
--- a/OpenCL/inc_vendor.h
+++ b/OpenCL/inc_vendor.h
@@ -130,6 +130,17 @@
 #define DECLSPEC
 #endif

+#define INLINE0 __attribute__ ((noinline))      // never inline the annotated function
+#define INLINE1 __attribute__ ((always_inline)) // force inlining; plain "inline" is not a valid attribute name and is ignored
+
+#if defined IS_AMD && defined IS_GPU             // AMD GPU: keep annotated functions out of line to reduce spilling
+#define INLINE INLINE0
+#elif defined IS_HIP                             // HIP builds use the same noinline default
+#define INLINE INLINE0
+#else
+#define INLINE                                   // elsewhere INLINE expands to nothing (no inlining attribute applied)
+#endif
+
 /**
 * AMD specific
 */
--- a/OpenCL/m10700-optimized.cl
+++ b/OpenCL/m10700-optimized.cl
@@ -315,7 +315,7 @@ DECLSPEC void make_w_with_offset (ctx_t *ctx, const u32 W_len, const u32 offset,
  }
 }

-DECLSPEC u32 do_round (LOCAL_AS u32 *sc, const u32 *pw, const u32 pw_len, ctx_t *ctx, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
+DECLSPEC INLINE u32 do_round (LOCAL_AS u32 *sc, const u32 *pw, const u32 pw_len, ctx_t *ctx, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
 {
  // make scratch buffer

--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -13,7 +13,7 @@
 ## Performance
 ##

-- PDF Kernel (10700): Improved performance on AMD GPU by using shared memory for the scratch buffer
+- PDF Kernel (10700): Improved performance on AMD GPU by using shared memory for the scratch buffer and disabling inlining to reduce register spilling

 ##
 ## Bugs