1
0
mirror of https://github.com/hashcat/hashcat.git synced 2024-11-21 23:58:07 +00:00

Merge pull request #3950 from fse-a/scrypt-performance-improvements

Improve performance of scrypt-based algorithms by code reordering.
This commit is contained in:
Jens Steube 2024-02-16 10:29:35 +01:00 committed by GitHub
commit fafb277e07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 302 additions and 155 deletions

View File

@ -128,6 +128,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@ -165,36 +195,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@ -217,6 +217,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4];
for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
#endif
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@ -459,10 +483,14 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
{
uint4 X[4];
const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@ -511,6 +539,7 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
}
}
w0[0] = 1;
w0[1] = 0;

View File

@ -135,6 +135,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@ -172,36 +202,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@ -224,6 +224,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4];
for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
#endif
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@ -595,10 +619,15 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
{
uint4 X[4];
const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@ -647,6 +676,7 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
}
}
w0[0] = 1;
w0[1] = 0;

View File

@ -176,6 +176,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@ -213,36 +243,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@ -265,6 +265,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4];
for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
#endif
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@ -597,10 +621,14 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
u32 w2[4];
u32 w3[4];
for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
{
uint4 X[4];
const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@ -649,6 +677,7 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
}
}
w0[0] = 1;
w0[1] = 0;

View File

@ -126,6 +126,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@ -163,36 +193,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
@ -215,6 +215,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4];
for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
#endif
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@ -549,10 +573,14 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
u32 w2[4];
u32 w3[4];
for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
{
uint4 X[4];
const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@ -601,6 +629,7 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
}
}
w0[0] = 1;
w0[1] = 0;

View File

@ -138,6 +138,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v)
DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
{
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
uint4 R0 = TI[STATE_CNT4 - 4];
uint4 R1 = TI[STATE_CNT4 - 3];
uint4 R2 = TI[STATE_CNT4 - 2];
@ -175,37 +205,8 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI)
TI[i + 2] = R2;
TI[i + 3] = R3;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4 / 2];
for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TT[dst_off + 0] = TI[src_off + 0];
TT[dst_off + 1] = TI[src_off + 1];
TT[dst_off + 2] = TI[src_off + 2];
TT[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8)
{
TI[dst_off + 0] = TI[src_off + 0];
TI[dst_off + 1] = TI[src_off + 1];
TI[dst_off + 2] = TI[src_off + 2];
TI[dst_off + 3] = TI[src_off + 3];
}
for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4)
{
TI[dst_off + 0] = TT[src_off + 0];
TI[dst_off + 1] = TT[src_off + 1];
TI[dst_off + 2] = TT[src_off + 2];
TI[dst_off + 3] = TT[src_off + 3];
}
#endif
}
DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid)
{
@ -227,6 +228,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
case 3: V = V3; break;
}
#if SCRYPT_R > 1
uint4 TT[STATE_CNT4];
for (int z = 0; z < zSIZE; z++) TT[z] = X[z];
for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4)
{
X[dst_off + 0] = TT[src_off + 0];
X[dst_off + 1] = TT[src_off + 1];
X[dst_off + 2] = TT[src_off + 2];
X[dst_off + 3] = TT[src_off + 3];
}
#endif
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
@ -517,10 +542,14 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len);
for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4)
{
for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8)
{
uint4 X[4];
const u32 l = i + j + ((j >= STATE_CNT4) ? (4 - STATE_CNT4) : 0);
X[0] = tmps[gid].P[l + 0];
X[1] = tmps[gid].P[l + 1];
X[2] = tmps[gid].P[l + 2];
@ -569,6 +598,7 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
}
}
w0[0] = 1;
w0[1] = 0;