From f8c08996700a812a789c27fde805598940c6429b Mon Sep 17 00:00:00 2001
From: fse-a
Date: Thu, 25 Jan 2024 10:27:38 +0100
Subject: [PATCH 1/2] Increased-virtual-backend-limit
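Raise DEVICES_MAX from 128 to 256 and replace the 64-bit
backend_devices_filter bitmask with a bool array of DEVICES_MAX + 1 entries,
so that more than 64 backend devices can be selected. The extra last entry
(index DEVICES_MAX) is set only when no --backend-devices filter was given,
i.e. when all devices are selected.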
---
include/common.h | 2 +-
include/types.h | 2 +-
src/backend.c | 60 ++++++++++++++++++++++++++----------------------
3 files changed, 35 insertions(+), 29 deletions(-)
diff --git a/include/common.h b/include/common.h
index 8fdd49bcb..a744d532a 100644
--- a/include/common.h
+++ b/include/common.h
@@ -131,7 +131,7 @@ but this is needed for VS compiler which doesn't have inline keyword but has __i
#define CPT_CACHE 0x20000
#define PARAMCNT 64
-#define DEVICES_MAX 128
+#define DEVICES_MAX 256
#define EXEC_CACHE 128
#define SPEED_CACHE 4096
#define SPEED_MAXAGE 4096
diff --git a/include/types.h b/include/types.h
index df1e97118..e6ea946f9 100644
--- a/include/types.h
+++ b/include/types.h
@@ -1901,7 +1901,7 @@ typedef struct backend_ctx
int opencl_devices_cnt;
int opencl_devices_active;
- u64 backend_devices_filter;
+ bool backend_devices_filter[DEVICES_MAX + 1];
hc_device_param_t *devices_param;
diff --git a/src/backend.c b/src/backend.c
index 6137d2767..f7c916e1d 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -157,13 +157,10 @@ static int backend_ctx_find_alias_devices (hashcat_ctx_t *hashcat_ctx)
// show a warning for specifically listed devices if they are an alias
- if (backend_ctx->backend_devices_filter != (u64) -1)
+ if ((backend_ctx->backend_devices_filter[DEVICES_MAX] == false) && (backend_ctx->backend_devices_filter[alias_device->device_id] == true))
{
- if (backend_ctx->backend_devices_filter & (1ULL << alias_device->device_id))
- {
- event_log_warning (hashcat_ctx, "The device #%d specifically listed was skipped because it is an alias of device #%d", alias_device->device_id + 1, backend_device->device_id + 1);
- event_log_warning (hashcat_ctx, NULL);
- }
+ event_log_warning (hashcat_ctx, "The device #%d specifically listed was skipped because it is an alias of device #%d", alias_device->device_id + 1, backend_device->device_id + 1);
+ event_log_warning (hashcat_ctx, NULL);
}
}
}
@@ -273,9 +270,9 @@ static int ocl_check_dri (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx)
return 0;
}
-static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char *backend_devices, u64 *out)
+static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char *backend_devices, bool *out)
{
- u64 backend_devices_filter = 0;
+ bool backend_devices_filter[DEVICES_MAX + 1] = {false};
if (backend_devices)
{
@@ -291,7 +288,7 @@ static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char
{
const int backend_device_id = (const int) strtol (next, NULL, 10);
- if ((backend_device_id <= 0) || (backend_device_id >= 64))
+ if ((backend_device_id <= 0) || (backend_device_id >= DEVICES_MAX))
{
event_log_error (hashcat_ctx, "Invalid device_id %d specified.", backend_device_id);
@@ -300,7 +297,7 @@ static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char
return false;
}
- backend_devices_filter |= 1ULL << (backend_device_id - 1);
+ backend_devices_filter[backend_device_id - 1] = true;
} while ((next = strtok_r ((char *) NULL, ",", &saveptr)) != NULL);
@@ -308,10 +305,16 @@ static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char
}
else
{
- backend_devices_filter = -1ULL;
+ for (int i = 0; i <= DEVICES_MAX; i++)
+ {
+ backend_devices_filter[i] = true;
+ }
}
- *out = backend_devices_filter;
+ for (int i = 0; i <= DEVICES_MAX; i++)
+ {
+ out[i] = backend_devices_filter[i];
+ }
return true;
}
@@ -4613,11 +4616,11 @@ int backend_ctx_init (hashcat_ctx_t *hashcat_ctx)
* Backend device selection
*/
- u64 backend_devices_filter;
+ bool backend_devices_filter[DEVICES_MAX + 1];
- if (setup_backend_devices_filter (hashcat_ctx, user_options->backend_devices, &backend_devices_filter) == false) return -1;
+ if (setup_backend_devices_filter (hashcat_ctx, user_options->backend_devices, backend_devices_filter) == false) return -1;
- backend_ctx->backend_devices_filter = backend_devices_filter;
+ for (int i = 0; i <= DEVICES_MAX; i++) backend_ctx->backend_devices_filter[i] = backend_devices_filter[i];
/**
* OpenCL device type selection
@@ -5276,7 +5279,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
// skipped
- if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0)
+ if (!backend_ctx->backend_devices_filter[device_id])
{
device_param->skipped = true;
}
@@ -5693,7 +5696,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
// skipped
- if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0)
+ if (!backend_ctx->backend_devices_filter[device_id])
{
device_param->skipped = true;
}
@@ -6190,7 +6193,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
// skipped
- if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0)
+ if (!backend_ctx->backend_devices_filter[device_id])
{
device_param->skipped = true;
}
@@ -6989,7 +6992,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
// skipped
- if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0)
+ if (!backend_ctx->backend_devices_filter[device_id])
{
device_param->skipped = true;
}
@@ -7592,7 +7595,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
if (device_param->skipped == false)
{
- if (backend_ctx->backend_devices_filter == -1ULL)
+ if (backend_ctx->backend_devices_filter[DEVICES_MAX])
{
if ((user_options->quiet == false) && (user_options->backend_info == 0))
{
@@ -7605,7 +7608,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
}
else
{
- if (backend_ctx->backend_devices_filter & (1ULL << device_param->device_id))
+ if (backend_ctx->backend_devices_filter[device_param->device_id])
{
// ok
}
@@ -7661,7 +7664,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
// additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)
- if (backend_ctx->backend_devices_cnt >= 64)
+ if (backend_ctx->backend_devices_cnt >= DEVICES_MAX)
{
event_log_error (hashcat_ctx, "Illegal use of the --backend-devices parameter because too many backend devices were found (%u).", backend_ctx->backend_devices_cnt);
event_log_error (hashcat_ctx, "If possible, disable one of your backends to reduce the number of backend devices. For example \"--backend-ignore-cuda\" or \"--backend-ignore-opencl\" .");
@@ -7669,16 +7672,18 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
return -1;
}
- if (backend_ctx->backend_devices_filter != (u64) -1)
+ if (!backend_ctx->backend_devices_filter[DEVICES_MAX])
{
- const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);
- if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask)
+ for (int i = backend_ctx->backend_devices_cnt; i < DEVICES_MAX; i++)
{
- event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
- event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
+ if (backend_ctx->backend_devices_filter[i])
+ {
+ event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
+ event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
- return -1;
+ return -1;
+ }
}
}
From 47509b2954e99f838c3ce20da86b47de49088ad1 Mon Sep 17 00:00:00 2001
From: fse-a
Date: Thu, 15 Feb 2024 15:17:29 +0100
Subject: [PATCH 2/2] Improve performance of scrypt-based algorithms by code
reordering.
---
OpenCL/m08900-pure.cl | 91 +++++++++++++++++++++++++++---------------
OpenCL/m15700-pure.cl | 92 ++++++++++++++++++++++++++++---------------
OpenCL/m22700-pure.cl | 91 +++++++++++++++++++++++++++---------------
OpenCL/m27700-pure.cl | 91 +++++++++++++++++++++++++++---------------
OpenCL/m28200-pure.cl | 92 ++++++++++++++++++++++++++++---------------
5 files changed, 302 insertions(+), 155 deletions(-)
diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl
index 3064263a2..b74b3d9c6 100644
--- a/OpenCL/m08900-pure.cl
+++ b/OpenCL/m08900-pure.cl
@@ -128,6 +128,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -165,36 +195,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -217,6 +217,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -459,10 +483,14 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -510,6 +538,7 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl
index e500b4f70..d435883ce 100644
--- a/OpenCL/m15700-pure.cl
+++ b/OpenCL/m15700-pure.cl
@@ -135,6 +135,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -172,36 +202,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -224,6 +224,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -595,10 +619,15 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -646,6 +675,7 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl
index a29df1c03..303e5e334 100644
--- a/OpenCL/m22700-pure.cl
+++ b/OpenCL/m22700-pure.cl
@@ -176,6 +176,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -213,36 +243,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -265,6 +265,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -597,10 +621,14 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
u32 w2[4];
u32 w3[4];
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -648,6 +676,7 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m27700-pure.cl b/OpenCL/m27700-pure.cl
index c62dc90d6..d9bf11510 100644
--- a/OpenCL/m27700-pure.cl
+++ b/OpenCL/m27700-pure.cl
@@ -126,6 +126,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -163,36 +193,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@@ -215,6 +215,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -549,10 +573,14 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
u32 w2[4];
u32 w3[4];
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -600,6 +628,7 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;
diff --git a/OpenCL/m28200-pure.cl b/OpenCL/m28200-pure.cl
index 2260e931b..58106a007 100644
--- a/OpenCL/m28200-pure.cl
+++ b/OpenCL/m28200-pure.cl
@@ -138,6 +138,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4 / 2];
+
+ for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TT[dst_off + 0] = TI[src_off + 0];
+ TT[dst_off + 1] = TI[src_off + 1];
+ TT[dst_off + 2] = TI[src_off + 2];
+ TT[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
+ {
+ TI[dst_off + 0] = TI[src_off + 0];
+ TI[dst_off + 1] = TI[src_off + 1];
+ TI[dst_off + 2] = TI[src_off + 2];
+ TI[dst_off + 3] = TI[src_off + 3];
+ }
+
+ for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
+ {
+ TI[dst_off + 0] = TT[src_off + 0];
+ TI[dst_off + 1] = TT[src_off + 1];
+ TI[dst_off + 2] = TT[src_off + 2];
+ TI[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@@ -175,38 +205,9 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
-
- #if SCRYPT_R > 1
-
- uint4 TT[STATE_CNT4 / 2];
-
- for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TT[dst_off + 0] = TI[src_off + 0];
- TT[dst_off + 1] = TI[src_off + 1];
- TT[dst_off + 2] = TI[src_off + 2];
- TT[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
- {
- TI[dst_off + 0] = TI[src_off + 0];
- TI[dst_off + 1] = TI[src_off + 1];
- TI[dst_off + 2] = TI[src_off + 2];
- TI[dst_off + 3] = TI[src_off + 3];
- }
-
- for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
- {
- TI[dst_off + 0] = TT[src_off + 0];
- TI[dst_off + 1] = TT[src_off + 1];
- TI[dst_off + 2] = TT[src_off + 2];
- TI[dst_off + 3] = TT[src_off + 3];
- }
-
- #endif
}
+
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
{
const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO;
@@ -227,6 +228,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
+ #if SCRYPT_R > 1
+
+ uint4 TT[STATE_CNT4];
+
+ for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
+
+ for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
+ {
+ X[dst_off + 0] = TT[src_off + 0];
+ X[dst_off + 1] = TT[src_off + 1];
+ X[dst_off + 2] = TT[src_off + 2];
+ X[dst_off + 3] = TT[src_off + 3];
+ }
+
+ #endif
+
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@@ -517,10 +542,14 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
- for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+ for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
+ for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
+ {
uint4 X[4];
+ const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
+
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@@ -568,6 +597,7 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
w3[3] = T[3].w;
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
+ }
}
w0[0] = 1;