From b7aabe4cb4e4d46cd3f094d6fbba763eec291665 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 13 Jul 2025 15:23:18 +0200 Subject: [PATCH] It's possible to implement the kernel entirely in registers, since all offsets are known throughout the algorithm's lifetime --- OpenCL/m12150-pure.cl | 177 ++++++++++++++++++++++++++---------------- 1 file changed, 111 insertions(+), 66 deletions(-) diff --git a/OpenCL/m12150-pure.cl b/OpenCL/m12150-pure.cl index 2768ee295..346442316 100644 --- a/OpenCL/m12150-pure.cl +++ b/OpenCL/m12150-pure.cl @@ -17,20 +17,18 @@ #define COMPARE_S M2S(INCLUDE_PATH/inc_comp_single.cl) #define COMPARE_M M2S(INCLUDE_PATH/inc_comp_multi.cl) -#define SHA512_DIGEST_LENGTH 64 - typedef struct shiro1_sha512_tmp { - u64 dgst[8]; - u64 out[8]; // Final output hash + u32 dgst[16]; + } shiro1_sha512_tmp_t; -KERNEL_FQ void m12150_init (KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) +KERNEL_FQ KERNEL_FA void m12150_init (KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) { const u32 gid = get_global_id (0); if (gid >= GID_CNT) return; - + sha512_ctx_t ctx; sha512_init (&ctx); @@ -41,75 +39,125 @@ KERNEL_FQ void m12150_init (KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) sha512_final (&ctx); - for (int i = 0; i < 8; i++) { - tmps[gid].dgst[i] = ctx.h[i]; - tmps[gid].out[i] = ctx.h[i]; - } + tmps[gid].dgst[ 0] = h32_from_64_S (ctx.h[0]); + tmps[gid].dgst[ 1] = l32_from_64_S (ctx.h[0]); + tmps[gid].dgst[ 2] = h32_from_64_S (ctx.h[1]); + tmps[gid].dgst[ 3] = l32_from_64_S (ctx.h[1]); + tmps[gid].dgst[ 4] = h32_from_64_S (ctx.h[2]); + tmps[gid].dgst[ 5] = l32_from_64_S (ctx.h[2]); + tmps[gid].dgst[ 6] = h32_from_64_S (ctx.h[3]); + tmps[gid].dgst[ 7] = l32_from_64_S (ctx.h[3]); + tmps[gid].dgst[ 8] = h32_from_64_S (ctx.h[4]); + tmps[gid].dgst[ 9] = l32_from_64_S (ctx.h[4]); + tmps[gid].dgst[10] = h32_from_64_S (ctx.h[5]); + tmps[gid].dgst[11] = l32_from_64_S (ctx.h[5]); + tmps[gid].dgst[12] = h32_from_64_S (ctx.h[6]); + tmps[gid].dgst[13] = l32_from_64_S (ctx.h[6]); + tmps[gid].dgst[14] = h32_from_64_S (ctx.h[7]); + tmps[gid].dgst[15] = l32_from_64_S (ctx.h[7]); } -KERNEL_FQ void m12150_loop(KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) { +KERNEL_FQ KERNEL_FA void m12150_loop(KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) +{ const u32 gid = get_global_id(0); if (gid >= GID_CNT) return; - sha512_ctx_t sha512_ctx; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + u32 w4[4]; + u32 w5[4]; + u32 w6[4]; + u32 w7[4]; - // Temporary buffer to hold the digest in u32 format - u32 digest_u32[16]; + w0[0] = tmps[gid].dgst[ 0]; + w0[1] = tmps[gid].dgst[ 1]; + w0[2] = tmps[gid].dgst[ 2]; + w0[3] = tmps[gid].dgst[ 3]; + w1[0] = tmps[gid].dgst[ 4]; + w1[1] = tmps[gid].dgst[ 5]; + w1[2] = tmps[gid].dgst[ 6]; + w1[3] = tmps[gid].dgst[ 7]; + w2[0] = tmps[gid].dgst[ 8]; + w2[1] = tmps[gid].dgst[ 9]; + w2[2] = tmps[gid].dgst[10]; + w2[3] = tmps[gid].dgst[11]; + w3[0] = tmps[gid].dgst[12]; + w3[1] = tmps[gid].dgst[13]; + w3[2] = tmps[gid].dgst[14]; + w3[3] = tmps[gid].dgst[15]; + w4[0] = 0x80000000; + w4[1] = 0; + w4[2] = 0; + w4[3] = 0; + w5[0] = 0; + w5[1] = 0; + w5[2] = 0; + w5[3] = 0; + w6[0] = 0; + w6[1] = 0; + w6[2] = 0; + w6[3] = 0; + w7[0] = 0; + w7[1] = 0; + w7[2] = 0; + w7[3] = 64 * 8; - // Convert u64 digest to u32 format manually - digest_u32[0] = h32_from_64_S(tmps[gid].dgst[0]); - digest_u32[1] = l32_from_64_S(tmps[gid].dgst[0]); - digest_u32[2] = h32_from_64_S(tmps[gid].dgst[1]); - digest_u32[3] = l32_from_64_S(tmps[gid].dgst[1]); - digest_u32[4] = h32_from_64_S(tmps[gid].dgst[2]); - digest_u32[5] = l32_from_64_S(tmps[gid].dgst[2]); - digest_u32[6] = h32_from_64_S(tmps[gid].dgst[3]); - digest_u32[7] = l32_from_64_S(tmps[gid].dgst[3]); - digest_u32[8] = h32_from_64_S(tmps[gid].dgst[4]); - digest_u32[9] = l32_from_64_S(tmps[gid].dgst[4]); - digest_u32[10] = h32_from_64_S(tmps[gid].dgst[5]); - digest_u32[11] = l32_from_64_S(tmps[gid].dgst[5]); - digest_u32[12] = h32_from_64_S(tmps[gid].dgst[6]); - digest_u32[13] = l32_from_64_S(tmps[gid].dgst[6]); - digest_u32[14] = h32_from_64_S(tmps[gid].dgst[7]); - digest_u32[15] = l32_from_64_S(tmps[gid].dgst[7]); + for (u32 i = 0; i < LOOP_CNT; i++) + { + u64 dgst[8]; - for (u32 i = 0; i < LOOP_CNT; i++) { - sha512_init (&sha512_ctx); - sha512_update (&sha512_ctx, digest_u32, SHA512_DIGEST_LENGTH); - sha512_final (&sha512_ctx); + dgst[0] = SHA512M_A; + dgst[1] = SHA512M_B; + dgst[2] = SHA512M_C; + dgst[3] = SHA512M_D; + dgst[4] = SHA512M_E; + dgst[5] = SHA512M_F; + dgst[6] = SHA512M_G; + dgst[7] = SHA512M_H; - for (int j = 0; j < 8; j++) { - tmps[gid].dgst[j] = sha512_ctx.h[j]; - } + sha512_transform (w0, w1, w2, w3, w4, w5, w6, w7, dgst); // Update the digest_u32 array for the next iteration - digest_u32[0] = h32_from_64_S(tmps[gid].dgst[0]); - digest_u32[1] = l32_from_64_S(tmps[gid].dgst[0]); - digest_u32[2] = h32_from_64_S(tmps[gid].dgst[1]); - digest_u32[3] = l32_from_64_S(tmps[gid].dgst[1]); - digest_u32[4] = h32_from_64_S(tmps[gid].dgst[2]); - digest_u32[5] = l32_from_64_S(tmps[gid].dgst[2]); - digest_u32[6] = h32_from_64_S(tmps[gid].dgst[3]); - digest_u32[7] = l32_from_64_S(tmps[gid].dgst[3]); - digest_u32[8] = h32_from_64_S(tmps[gid].dgst[4]); - digest_u32[9] = l32_from_64_S(tmps[gid].dgst[4]); - digest_u32[10] = h32_from_64_S(tmps[gid].dgst[5]); - digest_u32[11] = l32_from_64_S(tmps[gid].dgst[5]); - digest_u32[12] = h32_from_64_S(tmps[gid].dgst[6]); - digest_u32[13] = l32_from_64_S(tmps[gid].dgst[6]); - digest_u32[14] = h32_from_64_S(tmps[gid].dgst[7]); - digest_u32[15] = l32_from_64_S(tmps[gid].dgst[7]); + w0[0] = h32_from_64_S (dgst[0]); + w0[1] = l32_from_64_S (dgst[0]); + w0[2] = h32_from_64_S (dgst[1]); + w0[3] = l32_from_64_S (dgst[1]); + w1[0] = h32_from_64_S (dgst[2]); + w1[1] = l32_from_64_S (dgst[2]); + w1[2] = h32_from_64_S (dgst[3]); + w1[3] = l32_from_64_S (dgst[3]); + w2[0] = h32_from_64_S (dgst[4]); + w2[1] = l32_from_64_S (dgst[4]); + w2[2] = h32_from_64_S (dgst[5]); + w2[3] = l32_from_64_S (dgst[5]); + w3[0] = h32_from_64_S (dgst[6]); + w3[1] = l32_from_64_S (dgst[6]); + w3[2] = h32_from_64_S (dgst[7]); + w3[3] = l32_from_64_S (dgst[7]); } - // Store the final digest in the tmps buffer - for (int i = 0; i < 8; i++) { - tmps[gid].out[i] = sha512_ctx.h[i]; - } + tmps[gid].dgst[ 0] = w0[0]; + tmps[gid].dgst[ 1] = w0[1]; + tmps[gid].dgst[ 2] = w0[2]; + tmps[gid].dgst[ 3] = w0[3]; + tmps[gid].dgst[ 4] = w1[0]; + tmps[gid].dgst[ 5] = w1[1]; + tmps[gid].dgst[ 6] = w1[2]; + tmps[gid].dgst[ 7] = w1[3]; + tmps[gid].dgst[ 8] = w2[0]; + tmps[gid].dgst[ 9] = w2[1]; + tmps[gid].dgst[10] = w2[2]; + tmps[gid].dgst[11] = w2[3]; + tmps[gid].dgst[12] = w3[0]; + tmps[gid].dgst[13] = w3[1]; + tmps[gid].dgst[14] = w3[2]; + tmps[gid].dgst[15] = w3[3]; } -KERNEL_FQ void m12150_comp (KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) +KERNEL_FQ KERNEL_FA void m12150_comp (KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) { const u64 gid = get_global_id (0); @@ -117,13 +165,10 @@ KERNEL_FQ void m12150_comp (KERN_ATTR_TMPS (shiro1_sha512_tmp_t)) const u64 lid = get_local_id (0); - const u64 a = tmps[gid].out[0]; - const u64 b = tmps[gid].out[1]; - - const u32 r0 = l32_from_64_S (a); - const u32 r1 = h32_from_64_S (a); - const u32 r2 = l32_from_64_S (b); - const u32 r3 = h32_from_64_S (b); + const u32 r0 = tmps[gid].dgst[1]; + const u32 r1 = tmps[gid].dgst[0]; + const u32 r2 = tmps[gid].dgst[3]; + const u32 r3 = tmps[gid].dgst[2]; #define il_pos 0