From 47509b2954e99f838c3ce20da86b47de49088ad1 Mon Sep 17 00:00:00 2001
From: fse-a
Date: Thu, 15 Feb 2024 15:17:29 +0100
Subject: [PATCH 1/2] Improve performance of scrypt-based algorithms by code
 reordering
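
The five scrypt-based pure kernels keep the intermediate state in a slightly
different order so that the SCRYPT_R > 1 block shuffle can run at the start of
salsa_r () instead of at its end:

- salsa_r (): the #if SCRYPT_R > 1 reordering block moves from the tail of the
  function to its head, so the shuffle is applied to the incoming state rather
  than to the freshly computed one.
- scrypt_smix_init (): the first and second half of the state loaded from
  tmps[gid].P are interleaved once before the mixing loop, which is the layout
  salsa_r () now expects between calls.
- _comp kernels: tmps[gid].P is left in that internal layout, so the final loop
  walks it with an index mapping that restores the canonical block order before
  feeding the data to HMAC-SHA256.

The change is meant to be purely mechanical: for SCRYPT_R == 1 the #if blocks
compile out and the new _comp loop visits the same offsets as the old "l += 4"
loop, and for SCRYPT_R > 1 only the in-memory order of the intermediate state
changes, not the computed digests.

A minimal host-side sketch (illustration only, not part of the patch; it
assumes STATE_CNT4 == SCRYPT_R * 8 and SCRYPT_CNT4 == SCRYPT_P * STATE_CNT4 as
defined in the kernel headers) that prints which uint4 groups of tmps[gid].P
the reworked _comp loop reads, and in which order:

  /* prints the P read order of the new _comp loop for a given r/p */
  #include <stdio.h>

  int main (void)
  {
    const unsigned SCRYPT_R = 2, SCRYPT_P = 1;          // example parameters

    const unsigned STATE_CNT4  = SCRYPT_R * 8;          // uint4 per scrypt state
    const unsigned SCRYPT_CNT4 = SCRYPT_P * STATE_CNT4; // uint4 in P

    for (unsigned i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
    {
      for (unsigned j = 0; j < (STATE_CNT4 * 2); j += 8)
      {
        // same index mapping as in the patched _comp kernels
        const unsigned l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);

        printf ("P[%2u..%2u]\n", l, l + 3); // r=2: 0..3, 8..11, 4..7, 12..15
      }
    }

    return 0;
  }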
---
OpenCL/m08900-pure.cl | 91 +++++++++++++++++++++++++++---------------
OpenCL/m15700-pure.cl | 92 ++++++++++++++++++++++++++++---------------
OpenCL/m22700-pure.cl | 91 +++++++++++++++++++++++++++---------------
OpenCL/m27700-pure.cl | 91 +++++++++++++++++++++++++++---------------
OpenCL/m28200-pure.cl | 92 ++++++++++++++++++++++++++++---------------
5 files changed, 302 insertions(+), 155 deletions(-)
diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl
index 3064263a2..b74b3d9c6 100644
--- a/OpenCL/m08900-pure.cl
+++ b/OpenCL/m08900-pure.cl
@@ -128,6 +128,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -165,36 +195,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -217,6 +217,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -459,10 +483,14 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -510,6 +538,7 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl
index e500b4f70..d435883ce 100644
--- a/OpenCL/m15700-pure.cl
+++ b/OpenCL/m15700-pure.cl
@@ -135,6 +135,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -172,36 +202,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -224,6 +224,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -595,10 +619,15 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -646,6 +675,7 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl
index a29df1c03..303e5e334 100644
--- a/OpenCL/m22700-pure.cl
+++ b/OpenCL/m22700-pure.cl
@@ -176,6 +176,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -213,36 +243,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -265,6 +265,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -597,10 +621,14 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
u32 w2[4];
u32 w3[4];
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -648,6 +676,7 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m27700-pure.cl b/OpenCL/m27700-pure.cl
index c62dc90d6..d9bf11510 100644
--- a/OpenCL/m27700-pure.cl
+++ b/OpenCL/m27700-pure.cl
@@ -126,6 +126,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -163,36 +193,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -215,6 +215,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -549,10 +573,14 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
u32 w2[4];
u32 w3[4];
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -600,6 +628,7 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m28200-pure.cl b/OpenCL/m28200-pure.cl
index 2260e931b..58106a007 100644
--- a/OpenCL/m28200-pure.cl
+++ b/OpenCL/m28200-pure.cl
@@ -138,6 +138,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -175,38 +205,9 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
+
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
{
const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
@@ -227,6 +228,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -517,10 +542,14 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -568,6 +597,7 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
From 6716447dfce969ddde42a9abe0681500bee0df48 Mon Sep 17 00:00:00 2001
From: jsteube
Date: Sat, 20 Apr 2024 17:35:45 +0000
Subject: [PATCH 2/2] Add support for zero-length salts in Electrum $4 and $5
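
Electrum $4 and $5 run their KDF with an empty salt; until now the modules
faked a salt by copying the first 16 bytes of esalt->data_buf into salt_buf
and reporting salt_len = 16. The fake salt is replaced by a zero-length one
(salt_len = 0), so all loaded hashes of a mode share a single salt record and
the slow PBKDF2 loop runs once per password instead of once per password and
hash.

Because several digests can now sit behind one salt, both modes enable
OPTS_TYPE_DEEP_COMP_KERNEL and add module_deep_comp_kernel () returning
KERN_RUN_3, and the _comp kernels select their esalt entry via
DIGESTS_OFFSET_HOST + LOOP_POS instead of always reading
esalt_bufs[DIGESTS_OFFSET_HOST], so every digest is checked against its own
coords/data_buf. The esalt pointers used in the kernels are also marked const.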
---
OpenCL/m21700-pure.cl | 14 +++++++++-----
OpenCL/m21800-pure.cl | 9 +++++++--
src/modules/module_21700.c | 17 +++++++++--------
src/modules/module_21800.c | 17 +++++++++--------
4 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/OpenCL/m21700-pure.cl b/OpenCL/m21700-pure.cl
index 920f0efa5..554dcfbdf 100644
--- a/OpenCL/m21700-pure.cl
+++ b/OpenCL/m21700-pure.cl
@@ -325,6 +325,12 @@ KERNEL_FQ void m21700_comp (KERN_ATTR_TMPS_ESALT (electrum_tmp_t, electrum_t))
if (gid >= GID_CNT) return;
+ const u32 digest_pos = LOOP_POS;
+
+ const u32 digest_cur = DIGESTS_OFFSET_HOST + digest_pos;
+
+ GLOBAL_AS const electrum_t *electrum = &esalt_bufs[digest_cur];
+
u64 out[8];
out[0] = tmps[gid].out[0];
@@ -379,13 +385,12 @@ KERNEL_FQ void m21700_comp (KERN_ATTR_TMPS_ESALT (electrum_tmp_t, electrum_t))
* the main secp256k1 point multiplication by a scalar/tweak:
*/
- GLOBAL_AS secp256k1_t *coords = (GLOBAL_AS secp256k1_t *) &esalt_bufs[DIGESTS_OFFSET_HOST].coords;
+ GLOBAL_AS const secp256k1_t *coords = (GLOBAL_AS const secp256k1_t *) &electrum->coords;
u32 pubkey[64] = { 0 }; // for point_mul () we need: 1 + 32 bytes (for sha512 () we need more)
point_mul (pubkey, tweak, coords);
-
/*
* sha512 () of the pubkey:
*/
@@ -396,14 +401,13 @@ KERNEL_FQ void m21700_comp (KERN_ATTR_TMPS_ESALT (electrum_tmp_t, electrum_t))
sha512_update (&sha512_ctx, pubkey, 33); // 33 because of 32 byte curve point + sign
sha512_final (&sha512_ctx);
-
/*
* sha256-hmac () of the data_buf
*/
- GLOBAL_AS u32 *data_buf = (GLOBAL_AS u32 *) esalt_bufs[DIGESTS_OFFSET_HOST].data_buf;
+ GLOBAL_AS const u32 *data_buf = (GLOBAL_AS const u32 *) electrum->data_buf;
- u32 data_len = esalt_bufs[DIGESTS_OFFSET_HOST].data_len;
+ u32 data_len = electrum->data_len;
u32 key[16] = { 0 };
diff --git a/OpenCL/m21800-pure.cl b/OpenCL/m21800-pure.cl
index 9317eb19f..fd301671b 100644
--- a/OpenCL/m21800-pure.cl
+++ b/OpenCL/m21800-pure.cl
@@ -375,6 +375,11 @@ KERNEL_FQ void m21800_comp (KERN_ATTR_TMPS_ESALT (electrum_tmp_t, electrum_t))
if (gid >= GID_CNT) return;
+ const u32 digest_pos = LOOP_POS;
+
+ const u32 digest_cur = DIGESTS_OFFSET_HOST + digest_pos;
+
+ GLOBAL_AS const electrum_t *electrum = &esalt_bufs[digest_cur];
/*
* Start by copying/aligning the data
@@ -434,7 +439,7 @@ KERNEL_FQ void m21800_comp (KERN_ATTR_TMPS_ESALT (electrum_tmp_t, electrum_t))
* the main secp256k1 point multiplication by a scalar/tweak:
*/
- GLOBAL_AS secp256k1_t *coords = (GLOBAL_AS secp256k1_t *) &esalt_bufs[DIGESTS_OFFSET_HOST].coords;
+ GLOBAL_AS const secp256k1_t *coords = (GLOBAL_AS const secp256k1_t *) &electrum->coords;
u32 pubkey[64] = { 0 }; // for point_mul () we need: 1 + 32 bytes (for sha512 () we need more)
@@ -499,7 +504,7 @@ KERNEL_FQ void m21800_comp (KERN_ATTR_TMPS_ESALT (electrum_tmp_t, electrum_t))
// we need to run it at least once:
- GLOBAL_AS u32 *data_buf = (GLOBAL_AS u32 *) esalt_bufs[DIGESTS_OFFSET_HOST].data_buf;
+ GLOBAL_AS const u32 *data_buf = (GLOBAL_AS const u32 *) electrum->data_buf;
u32 data[4];
diff --git a/src/modules/module_21700.c b/src/modules/module_21700.c
index 7f138f5b9..ffd6178d9 100644
--- a/src/modules/module_21700.c
+++ b/src/modules/module_21700.c
@@ -25,6 +25,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
+ | OPTS_TYPE_DEEP_COMP_KERNEL
| OPTS_TYPE_PT_GENERATE_LE;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
static const char *ST_PASS = "hashcat";
@@ -66,6 +67,11 @@ typedef struct electrum_tmp
static const char *SIGNATURE_ELECTRUM = "$electrum$4*";
+u32 module_deep_comp_kernel (MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const u32 salt_pos, MAYBE_UNUSED const u32 digest_pos)
+{
+ return KERN_RUN_3;
+}
+
bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
{
if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK)
@@ -214,13 +220,8 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
// fake salt
- salt->salt_buf[0] = esalt->data_buf[0];
- salt->salt_buf[1] = esalt->data_buf[1];
- salt->salt_buf[2] = esalt->data_buf[2];
- salt->salt_buf[3] = esalt->data_buf[3];
-
- salt->salt_len = 16;
-
+ salt->salt_buf[0] = 0;
+ salt->salt_len = 0;
salt->salt_iter = 1024 - 1;
return (PARSER_OK);
@@ -294,7 +295,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_benchmark_charset = MODULE_DEFAULT;
module_ctx->module_benchmark_salt = MODULE_DEFAULT;
module_ctx->module_build_plain_postprocess = MODULE_DEFAULT;
- module_ctx->module_deep_comp_kernel = MODULE_DEFAULT;
+ module_ctx->module_deep_comp_kernel = module_deep_comp_kernel;
module_ctx->module_deprecated_notice = MODULE_DEFAULT;
module_ctx->module_dgst_pos0 = module_dgst_pos0;
module_ctx->module_dgst_pos1 = module_dgst_pos1;
diff --git a/src/modules/module_21800.c b/src/modules/module_21800.c
index da26f0efa..486eba089 100644
--- a/src/modules/module_21800.c
+++ b/src/modules/module_21800.c
@@ -26,6 +26,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE
+ | OPTS_TYPE_DEEP_COMP_KERNEL
| OPTS_TYPE_NATIVE_THREADS;
static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
static const char *ST_PASS = "hashcat";
@@ -66,6 +67,11 @@ typedef struct electrum_tmp
static const char *SIGNATURE_ELECTRUM = "$electrum$5*";
+u32 module_deep_comp_kernel (MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const u32 salt_pos, MAYBE_UNUSED const u32 digest_pos)
+{
+ return KERN_RUN_3;
+}
+
bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
{
// problem with this kernel is the huge amount of register pressure on u8 tmp[TMPSIZ];
@@ -191,13 +197,8 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
// fake salt
- salt->salt_buf[0] = esalt->data_buf[0];
- salt->salt_buf[1] = esalt->data_buf[1];
- salt->salt_buf[2] = esalt->data_buf[2];
- salt->salt_buf[3] = esalt->data_buf[3];
-
- salt->salt_len = 16;
-
+ salt->salt_buf[0] = 0;
+ salt->salt_len = 0;
salt->salt_iter = 1024 - 1;
return (PARSER_OK);
@@ -271,7 +272,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_benchmark_charset = MODULE_DEFAULT;
module_ctx->module_benchmark_salt = MODULE_DEFAULT;
module_ctx->module_build_plain_postprocess = MODULE_DEFAULT;
- module_ctx->module_deep_comp_kernel = MODULE_DEFAULT;
+ module_ctx->module_deep_comp_kernel = module_deep_comp_kernel;
module_ctx->module_deprecated_notice = MODULE_DEFAULT;
module_ctx->module_dgst_pos0 = module_dgst_pos0;
module_ctx->module_dgst_pos1 = module_dgst_pos1;