From e117e750fcbd37d21942b1a73e5158820460aedf Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 21 Nov 2018 13:00:30 +0100 Subject: [PATCH] Add restrict keyword to kernel declarations to help the compiler with caching optimizations --- OpenCL/amp_a0.cl | 2 +- OpenCL/amp_a1.cl | 2 +- OpenCL/amp_a3.cl | 2 +- OpenCL/inc_common.cl | 96 ++++++++++++++++++++-------------------- OpenCL/m01500_a3-pure.cl | 2 +- OpenCL/m03000_a3-pure.cl | 2 +- OpenCL/m14000_a3-pure.cl | 2 +- OpenCL/markov_be.cl | 6 +-- OpenCL/markov_le.cl | 6 +-- 9 files changed, 60 insertions(+), 60 deletions(-) diff --git a/OpenCL/amp_a0.cl b/OpenCL/amp_a0.cl index 9f8a5cb3f..32f1fc4f7 100644 --- a/OpenCL/amp_a0.cl +++ b/OpenCL/amp_a0.cl @@ -9,7 +9,7 @@ #include "inc_rp.h" #include "inc_rp.cl" -__kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, const u32 combs_mode, const u64 gid_max) +__kernel void amp (__global pw_t * restrict pws, __global pw_t * restrict pws_amp, __constant const kernel_rule_t * restrict rules_buf, __global const pw_t * restrict combs_buf, __global const bf_t * restrict bfs_buf, const u32 combs_mode, const u64 gid_max) { const u64 gid = get_global_id (0); diff --git a/OpenCL/amp_a1.cl b/OpenCL/amp_a1.cl index ec8db8275..68dc8000a 100644 --- a/OpenCL/amp_a1.cl +++ b/OpenCL/amp_a1.cl @@ -7,7 +7,7 @@ #include "inc_types.cl" #include "inc_common.cl" -__kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, const u32 combs_mode, const u64 gid_max) +__kernel void amp (__global pw_t * restrict pws, __global pw_t * restrict pws_amp, __global const kernel_rule_t * restrict rules_buf, __global const pw_t * restrict combs_buf, __global const bf_t * restrict bfs_buf, const u32 combs_mode, const u64 gid_max) { const u64 gid = get_global_id (0); diff --git a/OpenCL/amp_a3.cl b/OpenCL/amp_a3.cl index e2b5d9095..33282f977 100644 --- a/OpenCL/amp_a3.cl +++ b/OpenCL/amp_a3.cl @@ -7,7 +7,7 @@ #include "inc_vendor.cl" #include "inc_types.cl" -__kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const bf_t *bfs_buf, const u32 combs_mode, const u64 gid_max) +__kernel void amp (__global pw_t * restrict pws, __global pw_t * restrict pws_amp, __global const kernel_rule_t * restrict rules_buf, __global const pw_t * restrict combs_buf, __constant const bf_t * restrict bfs_buf, const u32 combs_mode, const u64 gid_max) { const u64 gid = get_global_id (0); diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index e48829cfa..02db55a93 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -23,41 +23,41 @@ * - P19: Type of the esalt_bufs structure with additional data, or void. */ -#define KERN_ATTR(p2,p4,p5,p6,p19) \ - __global pw_t *pws, \ - p2 const kernel_rule_t *rules_buf, \ - __global const pw_t *combs_buf, \ - p4, \ - __global p5 *tmps, \ - __global p6 *hooks, \ - __global const u32 *bitmaps_buf_s1_a, \ - __global const u32 *bitmaps_buf_s1_b, \ - __global const u32 *bitmaps_buf_s1_c, \ - __global const u32 *bitmaps_buf_s1_d, \ - __global const u32 *bitmaps_buf_s2_a, \ - __global const u32 *bitmaps_buf_s2_b, \ - __global const u32 *bitmaps_buf_s2_c, \ - __global const u32 *bitmaps_buf_s2_d, \ - __global plain_t *plains_buf, \ - __global const digest_t *digests_buf, \ - __global u32 *hashes_shown, \ - __global const salt_t *salt_bufs, \ - __global const p19 *esalt_bufs, \ - __global u32 *d_return_buf, \ - __global uint4 *d_scryptV0_buf, \ - __global uint4 *d_scryptV1_buf, \ - __global uint4 *d_scryptV2_buf, \ - __global uint4 *d_scryptV3_buf, \ - const u32 bitmap_mask, \ - const u32 bitmap_shift1, \ - const u32 bitmap_shift2, \ - const u32 salt_pos, \ - const u32 loop_pos, \ - const u32 loop_cnt, \ - const u32 il_cnt, \ - const u32 digests_cnt, \ - const u32 digests_offset, \ - const u32 combs_mode, \ +#define KERN_ATTR(p2,p4,p5,p6,p19) \ + __global pw_t * restrict pws, \ + p2 const kernel_rule_t * restrict rules_buf, \ + __global const pw_t * restrict combs_buf, \ + p4, \ + __global p5 * restrict tmps, \ + __global p6 * restrict hooks, \ + __global const u32 * restrict bitmaps_buf_s1_a, \ + __global const u32 * restrict bitmaps_buf_s1_b, \ + __global const u32 * restrict bitmaps_buf_s1_c, \ + __global const u32 * restrict bitmaps_buf_s1_d, \ + __global const u32 * restrict bitmaps_buf_s2_a, \ + __global const u32 * restrict bitmaps_buf_s2_b, \ + __global const u32 * restrict bitmaps_buf_s2_c, \ + __global const u32 * restrict bitmaps_buf_s2_d, \ + __global plain_t * restrict plains_buf, \ + __global const digest_t * restrict digests_buf, \ + __global u32 * restrict hashes_shown, \ + __global const salt_t * restrict salt_bufs, \ + __global const p19 * restrict esalt_bufs, \ + __global u32 * restrict d_return_buf, \ + __global uint4 * restrict d_scryptV0_buf, \ + __global uint4 * restrict d_scryptV1_buf, \ + __global uint4 * restrict d_scryptV2_buf, \ + __global uint4 * restrict d_scryptV3_buf, \ + const u32 bitmap_mask, \ + const u32 bitmap_shift1, \ + const u32 bitmap_shift2, \ + const u32 salt_pos, \ + const u32 loop_pos, \ + const u32 loop_cnt, \ + const u32 il_cnt, \ + const u32 digests_cnt, \ + const u32 digests_offset, \ + const u32 combs_mode, \ const u64 gid_max /* @@ -68,16 +68,16 @@ * do not use rules or tmps, etc. */ -#define KERN_ATTR_BASIC() KERN_ATTR (__global, __global const bf_t *bfs_buf, void, void, void) -#define KERN_ATTR_BITSLICE() KERN_ATTR (__global, __constant const bs_word_t *words_buf_r, void, void, void) -#define KERN_ATTR_ESALT(e) KERN_ATTR (__global, __global const bf_t *bfs_buf, void, void, e) -#define KERN_ATTR_RULES() KERN_ATTR (__constant, __global const bf_t *bfs_buf, void, void, void) -#define KERN_ATTR_RULES_ESALT(e) KERN_ATTR (__constant, __global const bf_t *bfs_buf, void, void, e) -#define KERN_ATTR_TMPS(t) KERN_ATTR (__global, __global const bf_t *bfs_buf, t, void, void) -#define KERN_ATTR_TMPS_ESALT(t,e) KERN_ATTR (__global, __global const bf_t *bfs_buf, t, void, e) -#define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (__global, __global const bf_t *bfs_buf, t, h, void) -#define KERN_ATTR_VECTOR() KERN_ATTR (__global, __constant const u32x *words_buf_r, void, void, void) -#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (__global, __constant const u32x *words_buf_r, void, void, e) +#define KERN_ATTR_BASIC() KERN_ATTR (__global, __global const bf_t * restrict bfs_buf, void, void, void) +#define KERN_ATTR_BITSLICE() KERN_ATTR (__global, __constant const bs_word_t * restrict words_buf_r, void, void, void) +#define KERN_ATTR_ESALT(e) KERN_ATTR (__global, __global const bf_t * restrict bfs_buf, void, void, e) +#define KERN_ATTR_RULES() KERN_ATTR (__constant, __global const bf_t * restrict bfs_buf, void, void, void) +#define KERN_ATTR_RULES_ESALT(e) KERN_ATTR (__constant, __global const bf_t * restrict bfs_buf, void, void, e) +#define KERN_ATTR_TMPS(t) KERN_ATTR (__global, __global const bf_t * restrict bfs_buf, t, void, void) +#define KERN_ATTR_TMPS_ESALT(t,e) KERN_ATTR (__global, __global const bf_t * restrict bfs_buf, t, void, e) +#define KERN_ATTR_TMPS_HOOKS(t,h) KERN_ATTR (__global, __global const bf_t * restrict bfs_buf, t, h, void) +#define KERN_ATTR_VECTOR() KERN_ATTR (__global, __constant const u32x * restrict words_buf_r, void, void, void) +#define KERN_ATTR_VECTOR_ESALT(e) KERN_ATTR (__global, __constant const u32x * restrict words_buf_r, void, void, e) /** * pure scalar functions @@ -60480,7 +60480,7 @@ DECLSPEC void gpu_decompress_entry (__global pw_idx_t *pws_idx, __global u32 *pw pw->pw_len = len; } -__kernel void gpu_decompress (__global pw_idx_t *pws_idx, __global u32 *pws_comp, __global pw_t *pws_buf, const u64 gid_max) +__kernel void gpu_decompress (__global pw_idx_t * restrict pws_idx, __global u32 * restrict pws_comp, __global pw_t * restrict pws_buf, const u64 gid_max) { const u64 gid = get_global_id (0); @@ -60493,7 +60493,7 @@ __kernel void gpu_decompress (__global pw_idx_t *pws_idx, __global u32 *pws_comp pws_buf[gid] = pw; } -__kernel void gpu_memset (__global uint4 *buf, const u32 value, const u64 gid_max) +__kernel void gpu_memset (__global uint4 * restrict buf, const u32 value, const u64 gid_max) { const u64 gid = get_global_id (0); @@ -60502,7 +60502,7 @@ __kernel void gpu_memset (__global uint4 *buf, const u32 value, const u64 gid_ma buf[gid] = (uint4) (value); } -__kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max) +__kernel void gpu_atinit (__global pw_t * restrict buf, const u64 gid_max) { const u64 gid = get_global_id (0); diff --git a/OpenCL/m01500_a3-pure.cl b/OpenCL/m01500_a3-pure.cl index 7880bdc41..3486ba268 100644 --- a/OpenCL/m01500_a3-pure.cl +++ b/OpenCL/m01500_a3-pure.cl @@ -1885,7 +1885,7 @@ DECLSPEC void transpose32c (u32 *data) // transpose bitslice mod : attention race conditions, need different buffers for *in and *out // -__kernel void m01500_tm (__global u32 *mod, __global bs_word_t *words_buf_r) +__kernel void m01500_tm (__global u32 * restrict mod, __global bs_word_t * restrict words_buf_r) { const u64 gid = get_global_id (0); diff --git a/OpenCL/m03000_a3-pure.cl b/OpenCL/m03000_a3-pure.cl index 586a79f0d..e434620c2 100644 --- a/OpenCL/m03000_a3-pure.cl +++ b/OpenCL/m03000_a3-pure.cl @@ -1730,7 +1730,7 @@ DECLSPEC void transpose32c (u32 *data) // transpose bitslice mod : attention race conditions, need different buffers for *in and *out // -__kernel void m03000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) +__kernel void m03000_tm (__global u32 * restrict mod, __global bs_word_t * restrict words_buf_r) { const u64 gid = get_global_id (0); diff --git a/OpenCL/m14000_a3-pure.cl b/OpenCL/m14000_a3-pure.cl index 9dfb01779..2dbd0b178 100644 --- a/OpenCL/m14000_a3-pure.cl +++ b/OpenCL/m14000_a3-pure.cl @@ -1730,7 +1730,7 @@ DECLSPEC void transpose32c (u32 *data) // transpose bitslice mod : attention race conditions, need different buffers for *in and *out // -__kernel void m14000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) +__kernel void m14000_tm (__global u32 * restrict mod, __global bs_word_t * restrict words_buf_r) { const u64 gid = get_global_id (0); diff --git a/OpenCL/markov_be.cl b/OpenCL/markov_be.cl index 63e3f9245..1d9dc0ff1 100644 --- a/OpenCL/markov_be.cl +++ b/OpenCL/markov_be.cl @@ -44,7 +44,7 @@ DECLSPEC void generate_pw (u32 *pw_buf, __global const cs_t *root_css_buf, __glo if (bits15) pw_buf[15] = (pw_l_len + pw_r_len) * 8; } -__kernel void l_markov (__global pw_t *pws_buf_l, __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u64 off, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) +__kernel void l_markov (__global pw_t * restrict pws_buf_l, __global const cs_t * restrict root_css_buf, __global const cs_t * restrict markov_css_buf, const u64 off, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) { const u64 gid = get_global_id (0); @@ -63,7 +63,7 @@ __kernel void l_markov (__global pw_t *pws_buf_l, __global const cs_t *root_css_ pws_buf_l[gid].pw_len = pw_l_len + pw_r_len; } -__kernel void r_markov (__global bf_t *pws_buf_r, __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u64 off, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) +__kernel void r_markov (__global bf_t * restrict pws_buf_r, __global const cs_t * restrict root_css_buf, __global const cs_t * restrict markov_css_buf, const u64 off, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) { const u64 gid = get_global_id (0); @@ -76,7 +76,7 @@ __kernel void r_markov (__global bf_t *pws_buf_r, __global const cs_t *root_css_ pws_buf_r[gid].i = pw_buf[0]; } -__kernel void C_markov (__global pw_t *pws_buf, __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u64 off, const u32 pw_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) +__kernel void C_markov (__global pw_t * restrict pws_buf, __global const cs_t * restrict root_css_buf, __global const cs_t * restrict markov_css_buf, const u64 off, const u32 pw_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) { const u64 gid = get_global_id (0); diff --git a/OpenCL/markov_le.cl b/OpenCL/markov_le.cl index 7f08971ed..d50c9f2d8 100644 --- a/OpenCL/markov_le.cl +++ b/OpenCL/markov_le.cl @@ -44,7 +44,7 @@ DECLSPEC void generate_pw (u32 *pw_buf, __global const cs_t *root_css_buf, __glo if (bits15) pw_buf[15] = (pw_l_len + pw_r_len) * 8; } -__kernel void l_markov (__global pw_t *pws_buf_l, __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u64 off, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) +__kernel void l_markov (__global pw_t * restrict pws_buf_l, __global const cs_t * restrict root_css_buf, __global const cs_t * restrict markov_css_buf, const u64 off, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) { const u64 gid = get_global_id (0); @@ -63,7 +63,7 @@ __kernel void l_markov (__global pw_t *pws_buf_l, __global const cs_t *root_css_ pws_buf_l[gid].pw_len = pw_l_len + pw_r_len; } -__kernel void r_markov (__global bf_t *pws_buf_r, __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u64 off, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) +__kernel void r_markov (__global bf_t * restrict pws_buf_r, __global const cs_t * restrict root_css_buf, __global const cs_t * restrict markov_css_buf, const u64 off, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) { const u64 gid = get_global_id (0); @@ -76,7 +76,7 @@ __kernel void r_markov (__global bf_t *pws_buf_r, __global const cs_t *root_css_ pws_buf_r[gid].i = pw_buf[0]; } -__kernel void C_markov (__global pw_t *pws_buf, __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u64 off, const u32 pw_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) +__kernel void C_markov (__global pw_t * restrict pws_buf, __global const cs_t * restrict root_css_buf, __global const cs_t * restrict markov_css_buf, const u64 off, const u32 pw_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max) { const u64 gid = get_global_id (0);