diff --git a/OpenCL/m31400_a0-optimized.cl b/OpenCL/m31400_a0-optimized.cl new file mode 100644 index 000000000..67af97d2d --- /dev/null +++ b/OpenCL/m31400_a0-optimized.cl @@ -0,0 +1,750 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include M2S(INCLUDE_PATH/inc_vendor.h) +#include M2S(INCLUDE_PATH/inc_types.h) +#include M2S(INCLUDE_PATH/inc_platform.cl) +#include M2S(INCLUDE_PATH/inc_common.cl) +#include M2S(INCLUDE_PATH/inc_rp_optimized.h) +#include M2S(INCLUDE_PATH/inc_rp_optimized.cl) +#include M2S(INCLUDE_PATH/inc_simd.cl) +#include M2S(INCLUDE_PATH/inc_hash_sha256.cl) +#include M2S(INCLUDE_PATH/inc_cipher_aes.h) +#include M2S(INCLUDE_PATH/inc_cipher_aes.cl) +#endif + +typedef struct scrtv2 +{ + u32 ct_buf[64]; + int ct_len; + +} scrtv2_t; + +DECLSPEC void shift_buffer_by_offset (PRIVATE_AS u32 *w0, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + w0[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_be_S ( 0, w0[0], offset); + break; + + case 1: + w0[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[0] = 0; + break; + + case 2: + w0[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w0[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + default: + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif + + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S(0x0706050403020100UL >> ((offset & 3) * 8)); + #endif + + switch (offset_switch) + { + case 0: + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); + break; + + case 1: + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); + w0[0] = 0; + break; + + case 2: + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w0[3] = hc_byte_perm_S (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + default: + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif +} + +DECLSPEC void aes256_scrt_format (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32 *pw, const u32 pw_len, PRIVATE_AS u32 *hash, PRIVATE_AS u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) +{ + AES256_set_encrypt_key (aes_ks, hash, s_te0, s_te1, s_te2, s_te3); + + shift_buffer_by_offset (hash, pw_len + 4); + + hash[0] = hc_swap32_S (pw_len); + hash[1] |= hc_swap32_S (pw[0]); + hash[2] |= hc_swap32_S (pw[1]); + hash[3] |= hc_swap32_S (pw[2]); + + AES256_encrypt (aes_ks, hash, out, s_te0, s_te1, s_te2, s_te3, s_te4); +} + +DECLSPEC void aes256_scrt_format_VV (PRIVATE_AS u32 
*aes_ks, PRIVATE_AS u32x *w, const u32x pw_len, PRIVATE_AS u32x *h, PRIVATE_AS u32x *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) +{ + #if VECT_SIZE == 1 + aes256_scrt_format (aes_ks, w, pw_len, h, out, s_te0, s_te1, s_te2, s_te3, s_te4); + #endif + + #if VECT_SIZE >= 2 + u32 tmp_w[4]; + u32 tmp_h[8]; + u32 tmp_out[4]; + + //s0 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s0; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s0; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s0, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s0 = tmp_out[i]; + + //s1 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s1; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s1; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s1, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s1 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 4 + + //s2 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s2; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s2; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s2, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s2 = tmp_out[i]; + + //s3 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s3; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s3; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s3, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s3 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 8 + + //s4 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s4; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s4; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s4, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s4 = tmp_out[i]; + + //s5 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s5; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s5; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s5, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s5 = tmp_out[i]; + + //s6 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s6; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s6; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s6, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s6 = tmp_out[i]; + + //s7 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s7; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s7; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s7, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s7 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 16 + + //s8 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s8; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s8; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s8, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s8 = tmp_out[i]; + + //s9 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s9; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s9; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s9, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s9 = tmp_out[i]; + + //sa + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sa; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sa; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sa, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sa = tmp_out[i]; + + //sb + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sb; + for (u32 i = 0; i < 8; i++) tmp_h[i] = 
h[i].sb; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sb, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sb = tmp_out[i]; + + //sc + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sc; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sc; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sc, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sc = tmp_out[i]; + + //sd + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sd; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sd; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sd, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sd = tmp_out[i]; + + //se + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].se; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].se; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.se, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].se = tmp_out[i]; + + //sf + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sf; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sf; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sf, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sf = tmp_out[i]; + + #endif +} + +KERNEL_FQ void m31400_m16 (KERN_ATTR_RULES_ESALT (scrtv2_t)) +{ +} + +KERNEL_FQ void m31400_m08 (KERN_ATTR_RULES_ESALT (scrtv2_t)) +{ +} + +KERNEL_FQ void m31400_m04 (KERN_ATTR_RULES_ESALT (scrtv2_t)) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS(); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + /** + * base + */ + + u32 ks[60]; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len & 63; + + u32x wt[4]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + wt[0] = w0[0]; + wt[1] = w0[1]; + wt[2] = w0[2]; + wt[3] = w0[3]; + + append_0x80_2x4_VV (w0, w1, out_len); + + /** + * sha256 + */ + + u32x w0_t = hc_swap32 (w0[0]); + u32x w1_t = hc_swap32 (w0[1]); + u32x w2_t = hc_swap32 (w0[2]); + u32x w3_t = hc_swap32 (w0[3]); + u32x w4_t = hc_swap32 (w1[0]); + u32x w5_t = hc_swap32 (w1[1]); + u32x w6_t = hc_swap32 (w1[2]); + u32x w7_t = hc_swap32 (w1[3]); + u32x w8_t = hc_swap32 (w2[0]); + u32x w9_t = hc_swap32 (w2[1]); + u32x wa_t = hc_swap32 (w2[2]); + u32x wb_t = hc_swap32 (w2[3]); + u32x wc_t = hc_swap32 (w3[0]); + u32x wd_t = hc_swap32 (w3[1]); + u32x we_t = 0; + u32x wf_t = out_len * 8; + + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; 
+ u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; + + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C02); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C03); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C04); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C05); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C06); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C07); + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C08); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C09); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C0a); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C0b); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C0c); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C0d); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C0e); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C0f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C10); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C11); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C12); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C13); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C14); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C15); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C16); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C17); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C18); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C19); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C1a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C1b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C1c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C1d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C1e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C1f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, 
d, e, f, g, h, w0_t, SHA256C20); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C21); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C22); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C23); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C24); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C25); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C26); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C27); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C28); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C29); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C2a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C2b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C2c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C2d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C2e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C2f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C30); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C31); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C32); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C33); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C34); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C35); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C36); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C37); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C38); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C39); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP 
(SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); + + u32x digest[8]; + + digest[0] = a + make_u32x (SHA256M_A); + digest[1] = b + make_u32x (SHA256M_B); + digest[2] = c + make_u32x (SHA256M_C); + digest[3] = d + make_u32x (SHA256M_D); + digest[4] = e + make_u32x (SHA256M_E); + digest[5] = f + make_u32x (SHA256M_F); + digest[6] = g + make_u32x (SHA256M_G); + digest[7] = h + make_u32x (SHA256M_H); + + u32x out[4] = { 0 }; + + aes256_scrt_format_VV (ks, wt, out_len, digest, out, s_te0, s_te1, s_te2, s_te3, s_te4); + + const u32x r0 = out[DGST_R0]; + const u32x r1 = out[DGST_R1]; + const u32x r2 = out[DGST_R2]; + const u32x r3 = out[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m31400_s16 (KERN_ATTR_RULES_ESALT (scrtv2_t)) +{ +} + +KERNEL_FQ void m31400_s08 (KERN_ATTR_RULES_ESALT (scrtv2_t)) +{ +} + +KERNEL_FQ void m31400_s04 (KERN_ATTR_RULES_ESALT (scrtv2_t)) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS(); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + /** + * base + */ + + u32 ks[60]; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len & 63; + + u32x wt[4]; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + wt[0] = w0[0]; + wt[1] = w0[1]; + wt[2] = w0[2]; + wt[3] = w0[3]; + + append_0x80_2x4_VV (w0, w1, out_len); + + /** + * sha256 + */ + + u32x w0_t = hc_swap32 (w0[0]); + u32x w1_t = hc_swap32 (w0[1]); + u32x w2_t = hc_swap32 (w0[2]); + u32x w3_t = hc_swap32 (w0[3]); + u32x w4_t = hc_swap32 (w1[0]); + u32x w5_t = hc_swap32 (w1[1]); + u32x w6_t = hc_swap32 (w1[2]); + u32x w7_t = hc_swap32 (w1[3]); + u32x w8_t = hc_swap32 (w2[0]); + u32x w9_t = hc_swap32 (w2[1]); + u32x wa_t = hc_swap32 (w2[2]); + u32x wb_t = hc_swap32 (w2[3]); + u32x wc_t = hc_swap32 (w3[0]); + u32x wd_t = hc_swap32 
(w3[1]); + u32x we_t = 0; + u32x wf_t = out_len * 8; + + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; + + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C02); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C03); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C04); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C05); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C06); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C07); + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C08); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C09); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C0a); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C0b); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C0c); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C0d); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C0e); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C0f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C10); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C11); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C12); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C13); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C14); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C15); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C16); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C17); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C18); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C19); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C1a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C1b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C1c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C1d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C1e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, 
h, a, wf_t, SHA256C1f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C20); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C21); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C22); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C23); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C24); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C25); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C26); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C27); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C28); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C29); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C2a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C2b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C2c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C2d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C2e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C2f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C30); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C31); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C32); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C33); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C34); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C35); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C36); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C37); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C38); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C39); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP 
(SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); + + u32x digest[8]; + + digest[0] = a + make_u32x (SHA256M_A); + digest[1] = b + make_u32x (SHA256M_B); + digest[2] = c + make_u32x (SHA256M_C); + digest[3] = d + make_u32x (SHA256M_D); + digest[4] = e + make_u32x (SHA256M_E); + digest[5] = f + make_u32x (SHA256M_F); + digest[6] = g + make_u32x (SHA256M_G); + digest[7] = h + make_u32x (SHA256M_H); + + u32x out[4] = { 0 }; + + aes256_scrt_format_VV (ks, wt, out_len, digest, out, s_te0, s_te1, s_te2, s_te3, s_te4); + + const u32x r0 = out[DGST_R0]; + const u32x r1 = out[DGST_R1]; + const u32x r2 = out[DGST_R2]; + const u32x r3 = out[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m31400_a0-pure.cl b/OpenCL/m31400_a0-pure.cl index c75aa1eca..28cd2c51a 100644 --- a/OpenCL/m31400_a0-pure.cl +++ b/OpenCL/m31400_a0-pure.cl @@ -184,10 +184,10 @@ KERNEL_FQ void m31400_mxx (KERN_ATTR_RULES_ESALT (scrtv2_t)) * base */ - COPY_PW (pws[gid]); - u32 ks[60]; + COPY_PW (pws[gid]); + /** * loop */ @@ -208,7 +208,7 @@ KERNEL_FQ void m31400_mxx (KERN_ATTR_RULES_ESALT (scrtv2_t)) u32 out[4] = { 0 }; - aes256_scrt_format (ks, tmp.i, tmp.pw_len, ctx.h, out,s_te0, s_te1, s_te2, s_te3, s_te4); + aes256_scrt_format (ks, tmp.i, tmp.pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); const u32 r0 = out[DGST_R0]; const u32 r1 = out[DGST_R1]; @@ -280,10 +280,10 @@ KERNEL_FQ void m31400_sxx (KERN_ATTR_RULES_ESALT (scrtv2_t)) * base */ - COPY_PW (pws[gid]); - u32 ks[60]; + COPY_PW (pws[gid]); + /** * loop */ @@ -304,7 +304,7 @@ KERNEL_FQ void m31400_sxx (KERN_ATTR_RULES_ESALT (scrtv2_t)) u32 out[4] = { 0 }; - aes256_scrt_format (ks, tmp.i, tmp.pw_len, ctx.h, out,s_te0, s_te1, s_te2, s_te3, s_te4); + aes256_scrt_format (ks, tmp.i, tmp.pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); const u32 r0 = out[DGST_R0]; const u32 r1 = out[DGST_R1]; diff --git a/OpenCL/m31400_a1-optimized.cl b/OpenCL/m31400_a1-optimized.cl new file mode 100644 index 000000000..f37066f84 --- /dev/null +++ b/OpenCL/m31400_a1-optimized.cl @@ -0,0 +1,901 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include M2S(INCLUDE_PATH/inc_vendor.h) +#include M2S(INCLUDE_PATH/inc_types.h) +#include M2S(INCLUDE_PATH/inc_platform.cl) +#include M2S(INCLUDE_PATH/inc_common.cl) +#include M2S(INCLUDE_PATH/inc_simd.cl) +#include M2S(INCLUDE_PATH/inc_hash_sha256.cl) +#include M2S(INCLUDE_PATH/inc_cipher_aes.cl) +#endif + +typedef struct scrtv2 +{ + u32 ct_buf[64]; + int ct_len; + +} scrtv2_t; + +DECLSPEC void shift_buffer_by_offset (PRIVATE_AS u32 *w0, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + w0[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_be_S ( 0, w0[0], offset); + 
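+      // offset < 4: each 32-bit word shifts right by `offset` bytes (big-endian order), carrying its top bytes in from the previous word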
break; + + case 1: + w0[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[0] = 0; + break; + + case 2: + w0[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w0[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + default: + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif + + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S(0x0706050403020100UL >> ((offset & 3) * 8)); + #endif + + switch (offset_switch) + { + case 0: + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); + break; + + case 1: + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); + w0[0] = 0; + break; + + case 2: + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w0[3] = hc_byte_perm_S (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + default: + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif +} + +DECLSPEC void aes256_scrt_format (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32 *pw, const u32 pw_len, PRIVATE_AS u32 *hash, PRIVATE_AS u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) +{ + AES256_set_encrypt_key (aes_ks, hash, s_te0, s_te1, s_te2, s_te3); + + shift_buffer_by_offset (hash, pw_len + 4); + + hash[0] = hc_swap32_S (pw_len); + hash[1] |= hc_swap32_S (pw[0]); + hash[2] |= hc_swap32_S (pw[1]); + hash[3] |= hc_swap32_S (pw[2]); + + AES256_encrypt (aes_ks, hash, out, s_te0, s_te1, s_te2, s_te3, s_te4); +} + +DECLSPEC void aes256_scrt_format_VV (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32x *w, const u32x pw_len, PRIVATE_AS u32x *h, PRIVATE_AS u32x *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) +{ + #if VECT_SIZE == 1 + aes256_scrt_format (aes_ks, w, pw_len, h, out, s_te0, s_te1, s_te2, s_te3, s_te4); + #endif + + #if VECT_SIZE >= 2 + u32 tmp_w[4]; + u32 tmp_h[8]; + u32 tmp_out[4]; + + //s0 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s0; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s0; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s0, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s0 = tmp_out[i]; + + //s1 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s1; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s1; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s1, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s1 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 4 + + //s2 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s2; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s2; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s2, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s2 = tmp_out[i]; + + //s3 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s3; + 
for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s3; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s3, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s3 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 8 + + //s4 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s4; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s4; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s4, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s4 = tmp_out[i]; + + //s5 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s5; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s5; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s5, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s5 = tmp_out[i]; + + //s6 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s6; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s6; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s6, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s6 = tmp_out[i]; + + //s7 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s7; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s7; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s7, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s7 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 16 + + //s8 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s8; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s8; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s8, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s8 = tmp_out[i]; + + //s9 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s9; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s9; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.s9, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s9 = tmp_out[i]; + + //sa + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sa; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sa; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sa, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sa = tmp_out[i]; + + //sb + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sb; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sb; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sb, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sb = tmp_out[i]; + + //sc + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sc; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sc; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sc, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sc = tmp_out[i]; + + //sd + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sd; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sd; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sd, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sd = tmp_out[i]; + + //se + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].se; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].se; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.se, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].se = tmp_out[i]; + + //sf + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sf; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sf; + + aes256_scrt_format (aes_ks, tmp_w, pw_len.sf, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sf = tmp_out[i]; + + #endif +} + +KERNEL_FQ void m31400_m04 
(KERN_ATTR_ESALT (scrtv2_t)) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS(); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + /** + * base + */ + + u32 ks[60]; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len & 63; + + u32x wt[4]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63; + + const u32x pw_len = (pw_l_len + pw_r_len) & 63; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + wt[0] = w0[0]; + wt[1] = w0[1]; + wt[2] = w0[2]; + wt[3] = w0[3]; + + w0[0] = hc_swap32 (w0[0]); + w0[1] = hc_swap32 (w0[1]); + w0[2] = hc_swap32 (w0[2]); + w0[3] = hc_swap32 (w0[3]); + w1[0] = hc_swap32 (w1[0]); + w1[1] = hc_swap32 (w1[1]); + w1[2] = hc_swap32 (w1[2]); + w1[3] = hc_swap32 (w1[3]); + w2[0] = hc_swap32 (w2[0]); + w2[1] = hc_swap32 (w2[1]); + w2[2] = hc_swap32 (w2[2]); + w2[3] = hc_swap32 (w2[3]); + w3[0] = hc_swap32 (w3[0]); + w3[1] = hc_swap32 (w3[1]); + w3[2] = hc_swap32 
(w3[2]); + w3[3] = hc_swap32 (w3[3]); + + append_0x80_4x4_VV (w0, w1, w2, w3, pw_len ^ 3); + + /** + * sha256 + */ + + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; + + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; + + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C02); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C03); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C04); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C05); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C06); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C07); + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C08); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C09); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C0a); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C0b); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C0c); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C0d); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C0e); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C0f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C10); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C11); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C12); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C13); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C14); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C15); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C16); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C17); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C18); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C19); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C1a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C1b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, 
e, f, g, h, a, b, c, d, wc_t, SHA256C1c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C1d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C1e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C1f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C20); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C21); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C22); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C23); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C24); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C25); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C26); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C27); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C28); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C29); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C2a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C2b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C2c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C2d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C2e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C2f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C30); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C31); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C32); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C33); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C34); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C35); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C36); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C37); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); 
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C38); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C39); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); + + u32x digest[8]; + + digest[0] = a + make_u32x (SHA256M_A); + digest[1] = b + make_u32x (SHA256M_B); + digest[2] = c + make_u32x (SHA256M_C); + digest[3] = d + make_u32x (SHA256M_D); + digest[4] = e + make_u32x (SHA256M_E); + digest[5] = f + make_u32x (SHA256M_F); + digest[6] = g + make_u32x (SHA256M_G); + digest[7] = h + make_u32x (SHA256M_H); + + u32x out[4] = { 0 }; + + aes256_scrt_format_VV (ks, wt, pw_len, digest, out, s_te0, s_te1, s_te2, s_te3, s_te4); + + const u32x r0 = out[DGST_R0]; + const u32x r1 = out[DGST_R1]; + const u32x r2 = out[DGST_R2]; + const u32x r3 = out[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m31400_m08 (KERN_ATTR_BASIC ()) +{ +} + +KERNEL_FQ void m31400_m16 (KERN_ATTR_BASIC ()) +{ +} + +KERNEL_FQ void m31400_s04 (KERN_ATTR_BASIC ()) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS(); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + /** + * base + */ + + u32 ks[60]; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len & 63; + + u32x wt[4]; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63; + + const u32x pw_len = (pw_l_len + pw_r_len) & 63; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + 
wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + wt[0] = w0[0]; + wt[1] = w0[1]; + wt[2] = w0[2]; + wt[3] = w0[3]; + + w0[0] = hc_swap32 (w0[0]); + w0[1] = hc_swap32 (w0[1]); + w0[2] = hc_swap32 (w0[2]); + w0[3] = hc_swap32 (w0[3]); + w1[0] = hc_swap32 (w1[0]); + w1[1] = hc_swap32 (w1[1]); + w1[2] = hc_swap32 (w1[2]); + w1[3] = hc_swap32 (w1[3]); + w2[0] = hc_swap32 (w2[0]); + w2[1] = hc_swap32 (w2[1]); + w2[2] = hc_swap32 (w2[2]); + w2[3] = hc_swap32 (w2[3]); + w3[0] = hc_swap32 (w3[0]); + w3[1] = hc_swap32 (w3[1]); + w3[2] = hc_swap32 (w3[2]); + w3[3] = hc_swap32 (w3[3]); + + append_0x80_4x4_VV (w0, w1, w2, w3, pw_len ^ 3); + + /** + * sha256 + */ + + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; + + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; + + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C02); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C03); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C04); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C05); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C06); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C07); + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C08); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C09); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C0a); 
+ SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C0b); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C0c); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C0d); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C0e); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C0f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C10); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C11); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C12); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C13); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C14); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C15); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C16); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C17); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C18); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C19); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C1a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C1b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C1c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C1d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C1e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C1f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C20); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C21); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C22); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C23); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C24); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C25); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C26); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C27); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, 
SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C28); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C29); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C2a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C2b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C2c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C2d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C2e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C2f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C30); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C31); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C32); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C33); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C34); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C35); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C36); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C37); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C38); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C39); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); + + u32x digest[8]; + + digest[0] = a + make_u32x (SHA256M_A); + digest[1] = b + make_u32x (SHA256M_B); + digest[2] = c + make_u32x (SHA256M_C); + digest[3] = d + make_u32x (SHA256M_D); + digest[4] = e + make_u32x (SHA256M_E); + digest[5] = f + make_u32x (SHA256M_F); + digest[6] = g + make_u32x (SHA256M_G); + digest[7] = h + make_u32x (SHA256M_H); + + u32x out[4] = { 0 }; + + aes256_scrt_format_VV (ks, wt, pw_len, digest, out, s_te0, s_te1, s_te2, s_te3, s_te4); + + const u32x r0 = out[DGST_R0]; + const u32x r1 = out[DGST_R1]; + const u32x r2 = 
out[DGST_R2]; + const u32x r3 = out[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m31400_s08 (KERN_ATTR_BASIC ()) +{ +} + +KERNEL_FQ void m31400_s16 (KERN_ATTR_BASIC ()) +{ +} diff --git a/OpenCL/m31400_a1-pure.cl b/OpenCL/m31400_a1-pure.cl index 4c6ea8b33..390bc8bf3 100644 --- a/OpenCL/m31400_a1-pure.cl +++ b/OpenCL/m31400_a1-pure.cl @@ -184,14 +184,16 @@ KERNEL_FQ void m31400_mxx (KERN_ATTR_ESALT (scrtv2_t)) * base */ + u32 wt[3]; + + u32 ks[60]; + sha256_ctx_t ctx0; sha256_init (&ctx0); sha256_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); - u32 ks[60]; - /** * loop */ @@ -202,19 +204,17 @@ KERNEL_FQ void m31400_mxx (KERN_ATTR_ESALT (scrtv2_t)) sha256_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); - u32 pw_candidate[3]; - - pw_candidate[0] = hc_swap32_S (ctx.w0[0]); - pw_candidate[1] = hc_swap32_S (ctx.w0[1]); - pw_candidate[2] = hc_swap32_S (ctx.w0[2]); + wt[0] = hc_swap32_S (ctx.w0[0]); + wt[1] = hc_swap32_S (ctx.w0[1]); + wt[2] = hc_swap32_S (ctx.w0[2]); - u32 pw_len=ctx.len; + u32 pw_len = ctx.len; sha256_final (&ctx); u32 out[4] = { 0 }; - aes256_scrt_format (ks, pw_candidate, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); + aes256_scrt_format (ks, wt, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); const u32 r0 = out[DGST_R0]; const u32 r1 = out[DGST_R1]; @@ -286,14 +286,16 @@ KERNEL_FQ void m31400_sxx (KERN_ATTR_ESALT (scrtv2_t)) * base */ + u32 wt[3]; + + u32 ks[60]; + sha256_ctx_t ctx0; sha256_init (&ctx0); sha256_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); - u32 ks[60]; - /** * loop */ @@ -304,19 +306,17 @@ KERNEL_FQ void m31400_sxx (KERN_ATTR_ESALT (scrtv2_t)) sha256_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); - u32 pw_candidate[3]; - - pw_candidate[0] = hc_swap32_S (ctx.w0[0]); - pw_candidate[1] = hc_swap32_S (ctx.w0[1]); - pw_candidate[2] = hc_swap32_S (ctx.w0[2]); + wt[0] = hc_swap32_S (ctx.w0[0]); + wt[1] = hc_swap32_S (ctx.w0[1]); + wt[2] = hc_swap32_S (ctx.w0[2]); - u32 pw_len=ctx.len; + u32 pw_len = ctx.len; sha256_final (&ctx); u32 out[4] = { 0 }; - aes256_scrt_format (ks, pw_candidate, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); + aes256_scrt_format (ks, wt, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); const u32 r0 = out[DGST_R0]; const u32 r1 = out[DGST_R1]; diff --git a/OpenCL/m31400_a3-optimized.cl b/OpenCL/m31400_a3-optimized.cl new file mode 100644 index 000000000..a9c43e961 --- /dev/null +++ b/OpenCL/m31400_a3-optimized.cl @@ -0,0 +1,1104 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include M2S(INCLUDE_PATH/inc_vendor.h) +#include M2S(INCLUDE_PATH/inc_types.h) +#include M2S(INCLUDE_PATH/inc_platform.cl) +#include M2S(INCLUDE_PATH/inc_common.h) +#include M2S(INCLUDE_PATH/inc_common.cl) +#include M2S(INCLUDE_PATH/inc_simd.cl) +#include M2S(INCLUDE_PATH/inc_hash_sha256.cl) +#include M2S(INCLUDE_PATH/inc_cipher_aes.h) +#include M2S(INCLUDE_PATH/inc_cipher_aes.cl) +#endif + +typedef struct scrtv2 +{ + u32 ct_buf[64]; + int ct_len; + +} scrtv2_t; + +DECLSPEC void shift_buffer_by_offset (PRIVATE_AS u32 *w0, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + w0[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + 
w0[0] = hc_bytealign_be_S ( 0, w0[0], offset); + break; + + case 1: + w0[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[0] = 0; + break; + + case 2: + w0[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w0[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + default: + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif + + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S(0x0706050403020100UL >> ((offset & 3) * 8)); + #endif + + switch (offset_switch) + { + case 0: + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); + break; + + case 1: + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); + w0[0] = 0; + break; + + case 2: + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w0[3] = hc_byte_perm_S (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + default: + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif +} + +DECLSPEC void aes256_scrt_format (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32 *pw, const u32 pw_len, PRIVATE_AS u32 *hash, PRIVATE_AS u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) +{ + AES256_set_encrypt_key (aes_ks, hash, s_te0, s_te1, s_te2, s_te3); + + shift_buffer_by_offset (hash, pw_len + 4); + + hash[0] = hc_swap32_S (pw_len); + hash[1] |= hc_swap32_S (pw[0]); + hash[2] |= hc_swap32_S (pw[1]); + hash[3] |= hc_swap32_S (pw[2]); + + AES256_encrypt (aes_ks, hash, out, s_te0, s_te1, s_te2, s_te3, s_te4); +} + +DECLSPEC void aes256_scrt_format_VV (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32x *w, const u32 pw_len, PRIVATE_AS u32x *h, PRIVATE_AS u32x *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) +{ + #if VECT_SIZE == 1 + aes256_scrt_format (aes_ks, w, pw_len, h, out, s_te0, s_te1, s_te2, s_te3, s_te4); + #endif + + #if VECT_SIZE >= 2 + u32 tmp_w[4]; + u32 tmp_h[8]; + u32 tmp_out[4]; + + //s0 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s0; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s0; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s0 = tmp_out[i]; + + //s1 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s1; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s1; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s1 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 4 + + //s2 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s2; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s2; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s2 = tmp_out[i]; + + //s3 + + for (u32 i = 
0; i < 4; i++) tmp_w[i] = w[i].s3; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s3; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s3 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 8 + + //s4 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s4; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s4; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s4 = tmp_out[i]; + + //s5 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s5; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s5; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s5 = tmp_out[i]; + + //s6 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s6; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s6; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s6 = tmp_out[i]; + + //s7 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s7; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s7; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s7 = tmp_out[i]; + + #endif + + #if VECT_SIZE >= 16 + + //s8 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s8; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s8; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s8 = tmp_out[i]; + + //s9 + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s9; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s9; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].s9 = tmp_out[i]; + + //sa + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sa; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sa; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sa = tmp_out[i]; + + //sb + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sb; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sb; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sb = tmp_out[i]; + + //sc + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sc; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sc; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sc = tmp_out[i]; + + //sd + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sd; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sd; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sd = tmp_out[i]; + + //se + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].se; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].se; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].se = tmp_out[i]; + + //sf + + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sf; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sf; + + aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); + + for (u32 i = 0; i < 4; i++) out[i].sf = tmp_out[i]; + + #endif +} + +DECLSPEC void m31400m (SHM_TYPE u32a 
*s_te0, SHM_TYPE u32a *s_te1, SHM_TYPE u32a *s_te2, SHM_TYPE u32a *s_te3, SHM_TYPE u32a *s_te4, PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTOR_ESALT (scrtv2_t)) +{ + /** + * modifiers are taken from args + */ + + /** + * loop + */ + + u32 ks[60]; + + u32x wt[4]; + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + wt[0] = w0; + wt[1] = w[1]; + wt[2] = w[2]; + wt[3] = w[3]; + + u32x ww0[4]; + u32x ww1[4]; + u32x ww2[4]; + u32x ww3[4]; + + ww0[0] = hc_swap32 (w0); + ww0[1] = hc_swap32 (w[1]); + ww0[2] = hc_swap32 (w[2]); + ww0[3] = hc_swap32 (w[3]); + ww1[0] = hc_swap32 (w[4]); + ww1[1] = hc_swap32 (w[5]); + ww1[2] = hc_swap32 (w[6]); + ww1[3] = hc_swap32 (w[7]); + ww2[0] = hc_swap32 (w[8]); + ww2[1] = hc_swap32 (w[9]); + ww2[2] = hc_swap32 (w[10]); + ww2[3] = hc_swap32 (w[11]); + ww3[0] = hc_swap32 (w[12]); + ww3[1] = hc_swap32 (w[13]); + ww3[2] = hc_swap32 (w[14]); + ww3[3] = hc_swap32 (w[15]); + + const int off = pw_len & 63; + + append_0x80_4x4 (ww0, ww1, ww2, ww3, off ^ 3); + + ww3[2] = 0; + ww3[3] = pw_len * 8; + + u32x w0_t = ww0[0]; + u32x w1_t = ww0[1]; + u32x w2_t = ww0[2]; + u32x w3_t = ww0[3]; + u32x w4_t = ww1[0]; + u32x w5_t = ww1[1]; + u32x w6_t = ww1[2]; + u32x w7_t = ww1[3]; + u32x w8_t = ww2[0]; + u32x w9_t = ww2[1]; + u32x wa_t = ww2[2]; + u32x wb_t = ww2[3]; + u32x wc_t = ww3[0]; + u32x wd_t = ww3[1]; + u32x we_t = ww3[2]; + u32x wf_t = ww3[3]; + + // sha256 + + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; + + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C02); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C03); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C04); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C05); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C06); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C07); + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C08); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C09); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C0a); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C0b); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C0c); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C0d); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C0e); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C0f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C10); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C11); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C12); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C13); + w4_t = SHA256_EXPAND (w2_t, 
wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C14); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C15); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C16); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C17); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C18); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C19); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C1a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C1b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C1c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C1d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C1e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C1f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C20); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C21); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C22); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C23); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C24); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C25); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C26); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C27); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C28); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C29); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C2a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C2b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C2c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C2d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C2e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, 
SHA256C2f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C30); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C31); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C32); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C33); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C34); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C35); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C36); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C37); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C38); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C39); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); + + u32x digest[8]; + + digest[0] = a + make_u32x (SHA256M_A); + digest[1] = b + make_u32x (SHA256M_B); + digest[2] = c + make_u32x (SHA256M_C); + digest[3] = d + make_u32x (SHA256M_D); + digest[4] = e + make_u32x (SHA256M_E); + digest[5] = f + make_u32x (SHA256M_F); + digest[6] = g + make_u32x (SHA256M_G); + digest[7] = h + make_u32x (SHA256M_H); + + u32x out[4] = { 0 }; + + aes256_scrt_format_VV (ks, wt, pw_len, digest, out, s_te0, s_te1, s_te2, s_te3, s_te4); + + const u32x r0 = out[DGST_R0]; + const u32x r1 = out[DGST_R1]; + const u32x r2 = out[DGST_R2]; + const u32x r3 = out[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m31400_m04 (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) +{ + /** + * base + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS (); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + 
w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m31400m (s_te0, s_te1, s_te2, s_te3, s_te4, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz); +} + +KERNEL_FQ void m31400_m08 (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) +{ + /** + * base + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS (); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m31400m (s_te0, s_te1, s_te2, s_te3, s_te4, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz); +} + +KERNEL_FQ void m31400_m16 (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) +{ + /** + * base + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS (); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = 
pws[gid].pw_len & 63; + + /** + * main + */ + + m31400m (s_te0, s_te1, s_te2, s_te3, s_te4, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz); +} + +DECLSPEC void m31400s (SHM_TYPE u32a *s_te0, SHM_TYPE u32a *s_te1, SHM_TYPE u32a *s_te2, SHM_TYPE u32a *s_te3, SHM_TYPE u32a *s_te4, PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTOR_ESALT (scrtv2_t)) +{ + /** + * modifiers are taken from args + */ + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 ks[60]; + + u32x wt[4]; + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + wt[0] = w0; + wt[1] = w[1]; + wt[2] = w[2]; + wt[3] = w[3]; + + u32x ww0[4]; + u32x ww1[4]; + u32x ww2[4]; + u32x ww3[4]; + + ww0[0] = hc_swap32 (w0); + ww0[1] = hc_swap32 (w[1]); + ww0[2] = hc_swap32 (w[2]); + ww0[3] = hc_swap32 (w[3]); + ww1[0] = hc_swap32 (w[4]); + ww1[1] = hc_swap32 (w[5]); + ww1[2] = hc_swap32 (w[6]); + ww1[3] = hc_swap32 (w[7]); + ww2[0] = hc_swap32 (w[8]); + ww2[1] = hc_swap32 (w[9]); + ww2[2] = hc_swap32 (w[10]); + ww2[3] = hc_swap32 (w[11]); + ww3[0] = hc_swap32 (w[12]); + ww3[1] = hc_swap32 (w[13]); + ww3[2] = hc_swap32 (w[14]); + ww3[3] = hc_swap32 (w[15]); + + const int off = pw_len & 63; + + append_0x80_4x4 (ww0, ww1, ww2, ww3, off ^ 3); + + ww3[2] = 0; + ww3[3] = pw_len * 8; + + u32x w0_t = ww0[0]; + u32x w1_t = ww0[1]; + u32x w2_t = ww0[2]; + u32x w3_t = ww0[3]; + u32x w4_t = ww1[0]; + u32x w5_t = ww1[1]; + u32x w6_t = ww1[2]; + u32x w7_t = ww1[3]; + u32x w8_t = ww2[0]; + u32x w9_t = ww2[1]; + u32x wa_t = ww2[2]; + u32x wb_t = ww2[3]; + u32x wc_t = ww3[0]; + u32x wd_t = ww3[1]; + u32x we_t = ww3[2]; + u32x wf_t = ww3[3]; + + // sha256 + + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; + + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C02); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C03); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C04); + SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C05); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C06); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C07); + SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C08); + SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C09); + SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C0a); + SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C0b); + SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C0c); + 
SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C0d); + SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C0e); + SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C0f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C10); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C11); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C12); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C13); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C14); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C15); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C16); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C17); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C18); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C19); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C1a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C1b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C1c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C1d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C1e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C1f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C20); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C21); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C22); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C23); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C24); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C25); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C26); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C27); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C28); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, 
w9_t, SHA256C29); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C2a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C2b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C2c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C2d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C2e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C2f); + + w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C30); + w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C31); + w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, SHA256C32); + w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, SHA256C33); + w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, SHA256C34); + w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, SHA256C35); + w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, SHA256C36); + w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, SHA256C37); + w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, SHA256C38); + w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, SHA256C39); + wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); + wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); + wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); + we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); + wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); + + u32x digest[8]; + + digest[0] = a + make_u32x (SHA256M_A); + digest[1] = b + make_u32x (SHA256M_B); + digest[2] = c + make_u32x (SHA256M_C); + digest[3] = d + make_u32x (SHA256M_D); + digest[4] = e + make_u32x (SHA256M_E); + digest[5] = f + make_u32x (SHA256M_F); + digest[6] = g + make_u32x (SHA256M_G); + digest[7] = h + make_u32x (SHA256M_H); + + u32x out[4] = { 0 }; + + aes256_scrt_format_VV (ks, wt, pw_len, digest, out, s_te0, s_te1, s_te2, s_te3, s_te4); + + const u32x r0 = out[DGST_R0]; + const u32x r1 = out[DGST_R1]; + const u32x r2 = out[DGST_R2]; + const u32x r3 = out[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m31400_s04 (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) +{ + /** + * 
base + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS (); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + u32 w[16]; + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m31400s (s_te0, s_te1, s_te2, s_te3, s_te4, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz); +} + +KERNEL_FQ void m31400_s08 (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) +{ + /** + * base + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS (); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m31400s (s_te0, s_te1, s_te2, s_te3, s_te4, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz); +} + +KERNEL_FQ void m31400_s16 (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) +{ + /** + * base + */ + + const u64 lid = get_local_id (0); + const u64 gid = get_global_id (0); + const u64 lsz = get_local_size (0); + + /** + * aes shared + */ + + #ifdef REAL_SHM + + LOCAL_VK u32 s_te0[256]; + LOCAL_VK u32 s_te1[256]; + LOCAL_VK u32 s_te2[256]; + LOCAL_VK u32 s_te3[256]; + LOCAL_VK u32 s_te4[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + s_te0[i] = te0[i]; + s_te1[i] = 
te1[i]; + s_te2[i] = te2[i]; + s_te3[i] = te3[i]; + s_te4[i] = te4[i]; + } + + SYNC_THREADS (); + + #else + + CONSTANT_AS u32a *s_te0 = te0; + CONSTANT_AS u32a *s_te1 = te1; + CONSTANT_AS u32a *s_te2 = te2; + CONSTANT_AS u32a *s_te3 = te3; + CONSTANT_AS u32a *s_te4 = te4; + + #endif + + if (gid >= GID_CNT) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m31400s (s_te0, s_te1, s_te2, s_te3, s_te4, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz); +} diff --git a/OpenCL/m31400_a3-pure.cl b/OpenCL/m31400_a3-pure.cl index 70ce410ba..7baacb913 100644 --- a/OpenCL/m31400_a3-pure.cl +++ b/OpenCL/m31400_a3-pure.cl @@ -1,7 +1,7 @@ /** -* Author......: See docs/credits.txt -* License.....: MIT -*/ + * Author......: See docs/credits.txt + * License.....: MIT + */ #define NEW_SIMD_CODE @@ -142,27 +142,27 @@ DECLSPEC void aes256_scrt_format_VV (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32x *w, #endif #if VECT_SIZE >= 2 - u32 tmp_w[64]; + u32 tmp_w[4]; u32 tmp_h[8]; u32 tmp_out[4]; //s0 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s0; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s0; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s0; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s0; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s0 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s0 = tmp_out[i]; //s1 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s1; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s1; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s1; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s1; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s1 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s1 = tmp_out[i]; #endif @@ -170,21 +170,21 @@ DECLSPEC void aes256_scrt_format_VV (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32x *w, //s2 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s2; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s2; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s2; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s2; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s2 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s2 = tmp_out[i]; //s3 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s3; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s3; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s3; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s3; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s3 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s3 = tmp_out[i]; #endif @@ -192,39 +192,39 @@ DECLSPEC void 
aes256_scrt_format_VV (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32x *w, //s4 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s4; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s4; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s4; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s4; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s4 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s4 = tmp_out[i]; //s5 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s5; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s5; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s5; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s5; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s5 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s5 = tmp_out[i]; //s6 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s6; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s6; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s6; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s6; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s6 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s6 = tmp_out[i]; //s7 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s7; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s7; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s7; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s7; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s7 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s7 = tmp_out[i]; #endif @@ -232,75 +232,75 @@ DECLSPEC void aes256_scrt_format_VV (PRIVATE_AS u32 *aes_ks, PRIVATE_AS u32x *w, //s8 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s8; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s8; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s8; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s8; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s8 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s8 = tmp_out[i]; //s9 - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].s9; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s9; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].s9; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].s9; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].s9 = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].s9 = tmp_out[i]; //sa - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].sa; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sa; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sa; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sa; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].sa = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].sa = tmp_out[i]; //sb - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].sb; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sb; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sb; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sb; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].sb = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].sb = tmp_out[i]; //sc - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].sc; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sc; 
+ for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sc; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sc; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].sc = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].sc = tmp_out[i]; //sd - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].sd; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sd; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sd; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sd; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].sd = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].sd = tmp_out[i]; //se - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].se; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].se; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].se; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].se; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].se = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].se = tmp_out[i]; //sf - for (u32 i = 0; i < 64; i++) tmp_w[i] = w[i].sf; - for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sf; + for (u32 i = 0; i < 4; i++) tmp_w[i] = w[i].sf; + for (u32 i = 0; i < 8; i++) tmp_h[i] = h[i].sf; aes256_scrt_format (aes_ks, tmp_w, pw_len, tmp_h, tmp_out, s_te0, s_te1, s_te2, s_te3, s_te4); - for (u32 i = 0; i < 4; i++) out[i].sf = tmp_out[i]; + for (u32 i = 0; i < 4; i++) out[i].sf = tmp_out[i]; #endif } @@ -354,10 +354,12 @@ KERNEL_FQ void m31400_mxx (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) * base */ - const u32 pw_len = pws[gid].pw_len; + u32 ks[60]; u32x w[64] = {0}; + const u32 pw_len = pws[gid].pw_len; + for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) { w[idx] = pws[gid].i[idx]; @@ -385,11 +387,9 @@ KERNEL_FQ void m31400_mxx (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) sha256_final_vector (&ctx); - u32x out[4] = {0}; + u32x out[4] = { 0 }; - u32 aes_ks[60]; - - aes256_scrt_format_VV (aes_ks, w, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); + aes256_scrt_format_VV (ks, w, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); const u32x r0 = out[DGST_R0]; const u32x r1 = out[DGST_R1]; @@ -461,9 +461,11 @@ KERNEL_FQ void m31400_sxx (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) * base */ - const u32 pw_len = pws[gid].pw_len; + u32 ks[60]; - u32x w[64] = {0}; + u32x w[64] = { 0 }; + + const u32 pw_len = pws[gid].pw_len; for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) { @@ -492,11 +494,9 @@ KERNEL_FQ void m31400_sxx (KERN_ATTR_VECTOR_ESALT (scrtv2_t)) sha256_final_vector (&ctx); - u32x out[4] = {0}; - - u32 aes_ks[60]; + u32x out[4] = { 0 }; - aes256_scrt_format_VV (aes_ks, w, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); + aes256_scrt_format_VV (ks, w, pw_len, ctx.h, out, s_te0, s_te1, s_te2, s_te3, s_te4); const u32x r0 = out[DGST_R0]; const u32x r1 = out[DGST_R1]; diff --git a/docs/changes.txt b/docs/changes.txt index 3965c968d..c9250a89a 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -83,6 +83,7 @@ - Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629 - User Options: Added error message when mixing --username and --show to warn users of exponential delay - MetaMask: update extraction tool to support MetaMask Mobile wallets +- SecureCRT MasterPassphrase v2: update module, pure kernels and test unit. Add optimized kernels. 
- Metal Backend: added workaround to prevent 'Infinite Loop' bug when build kernels - User Options: added --metal-compiler-runtime option diff --git a/src/modules/module_31400.c b/src/modules/module_31400.c index bcc8654bb..0f988b124 100644 --- a/src/modules/module_31400.c +++ b/src/modules/module_31400.c @@ -28,9 +28,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_NOT_SALTED | OPTI_TYPE_RAW_HASH; -static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE - | OPTS_TYPE_PT_ADD80 - | OPTS_TYPE_PT_ADDBITS15; +static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE; // OPTS_TYPE_PT_ADD80 and OPTS_TYPE_PT_ADDBITS15 added within kernel static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; static const char *ST_PASS = "hashcat"; static const char *ST_HASH = "S:\"Config Passphrase\"=02:ded7137400e0a1004a12f1708453968ccc270908ba02ab0345c83690d1de3d9937587be66ad2a7fe8cc6cb16ecff02e61ac05e09d4f49f284efd24f6b16d6ae3"; @@ -86,7 +84,7 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE | TOKEN_ATTR_VERIFY_SIGNATURE; token.len_min[1] = 96; - token.len_max[1] = 160; + token.len_max[1] = 224; token.attr[1] = TOKEN_ATTR_VERIFY_LENGTH | TOKEN_ATTR_VERIFY_HEX; diff --git a/tools/test_modules/m31400.pm b/tools/test_modules/m31400.pm index 60a855b6c..6eae0ef03 100644 --- a/tools/test_modules/m31400.pm +++ b/tools/test_modules/m31400.pm @@ -11,39 +11,36 @@ use warnings; use Digest::SHA qw (sha256_hex); use Crypt::CBC; -sub module_constraints { [[0, 256], [-1, -1], [0, 55], [-1, -1], [-1, -1]] } +sub module_constraints { [[0, 55], [-1, -1], [0, 55], [-1, -1], [-1, -1]] } sub calculate_padding { - my $length = shift; - my $blocksize = shift // 32; - my $minpadding = shift // 16; + my $length = shift; + my $blocksize = shift // 32; + my $minpadding = shift // 16; - my $padded_len=$length+$minpadding; - my $finalpadded=( ( $padded_len - 1 ) | ( $blocksize - 1 ) ) + 1; - - return $finalpadded - $length; + my $padded_len = $length+$minpadding; + my $finalpadded = (($padded_len - 1) | ($blocksize - 1)) + 1; + return $finalpadded - $length; } sub module_generate_hash { my $word = shift; - my $total_len = (length($word)*2) + 8 + 64; - my $padding = shift // random_hex_string (calculate_padding($total_len)); - - if (length $padding == 0){ - $padding=random_hex_string (calculate_padding($total_len)); - } + my $total_len = (length ($word) * 2) + 8 + 64; + my $padding = shift // random_hex_string (calculate_padding ($total_len)); + if (length $padding == 0) { + $padding = random_hex_string (calculate_padding ($total_len)); + } my $digest = sha256_hex ($word); - my $len = sprintf("%02d",length($word)); - my $paddedlen = sprintf("%02x000000", $len); - my $hexofword=unpack "H*",$word; + my $len = sprintf ("%02d", length ($word)); + my $paddedlen = sprintf ("%02x000000", $len); + my $hexofword = unpack "H*", $word; my $plaintext = $paddedlen . $hexofword . $digest . 
$padding; - my $aes = Crypt::CBC->new ({ key => pack ("H*", $digest), cipher => "Crypt::Rijndael", @@ -54,18 +51,18 @@ sub module_generate_hash padding => "none", }); - my $ciphertext=$aes->encrypt(pack("H*",$plaintext)); - my $hash = sprintf("S:\"Config Passphrase\"=02:%s",unpack ("H*",$ciphertext)); + my $ciphertext = $aes->encrypt (pack ("H*", $plaintext)); + my $hash = sprintf ("S:\"Config Passphrase\"=02:%s", unpack ("H*", $ciphertext)); return $hash } sub get_aes { - my $word_packed=shift; - my $key = sha256_hex ($word_packed); + my $word_packed = shift; + my $key = sha256_hex ($word_packed); - my $aes = Crypt::CBC->new ({ + my $aes = Crypt::CBC->new ({ key => pack ("H*", $key), cipher => "Crypt::Rijndael", iv => "\x00" x 16, @@ -74,6 +71,7 @@ sub get_aes keysize => 32, padding => "none", }); + return $aes } @@ -87,11 +85,10 @@ sub module_verify_hash return unless defined $word; my $word_packed = pack_if_HEX_notation ($word); - my $decrypted = get_aes($word_packed)->decrypt(pack"H*", $hash); - my $plaintext_hex=unpack "H*",$decrypted; - my $passlen=hex(substr($plaintext_hex,0,2)); - my $padding = substr($plaintext_hex,8+2*$passlen+64); - + my $decrypted = get_aes ($word_packed)->decrypt (pack ("H*", $hash)); + my $plaintext_hex = unpack "H*", $decrypted; + my $passlen = hex (substr ($plaintext_hex, 0, 2)); + my $padding = substr ($plaintext_hex, 8 + 2 * $passlen + 64); my $new_hash = module_generate_hash ($word_packed,$padding);
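
Reviewer note (not part of the patch): the kernels and the Perl test module above agree on one construction. The AES-256 key is SHA-256(password), and the first 16-byte plaintext block is the length byte, three zero bytes, up to 12 password bytes, and the leading bytes of the digest (this is what shift_buffer_by_offset plus the hash[1..3] |= overlays build). Since the test module encrypts with CBC under a zero IV, block 0 of CBC equals plain ECB, so a candidate can be checked by encrypting that single block and comparing it with the first 16 ciphertext bytes of the stored hash. Below is a minimal host-side sketch of that check, assuming OpenSSL's libcrypto (compile with -lcrypto); the helper name scrt_first_block and the main driver are illustrative and not part of this patch.

// Host-side reference for the first-block check performed by the kernels above.
// Sketch only: assumes OpenSSL; names are illustrative, not from this patch.
#include <openssl/evp.h>
#include <openssl/sha.h>
#include <stdio.h>
#include <string.h>

// Build the first 16-byte plaintext block: len | 00 00 00 | password | digest head.
static void scrt_first_block (const unsigned char *pw, size_t pw_len,
                              const unsigned char *digest, unsigned char *block)
{
  block[0] = (unsigned char) pw_len;
  block[1] = block[2] = block[3] = 0;

  const size_t pw_part = (pw_len < 12) ? pw_len : 12;

  memcpy (block + 4, pw, pw_part);

  // Remaining bytes come from the start of the SHA-256 digest,
  // mirroring shift_buffer_by_offset (hash, pw_len + 4) in the kernels.
  memcpy (block + 4 + pw_part, digest, 12 - pw_part);
}

int main (void)
{
  const unsigned char pw[] = "hashcat";
  const size_t pw_len = sizeof (pw) - 1;

  unsigned char digest[SHA256_DIGEST_LENGTH];
  SHA256 (pw, pw_len, digest); // the digest doubles as the AES-256 key

  unsigned char block[16];
  scrt_first_block (pw, pw_len, digest, block);

  // CBC with a zero IV degenerates to ECB for the first block.
  unsigned char ct[16]; int outl = 0;
  EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new ();
  EVP_EncryptInit_ex (ctx, EVP_aes_256_ecb (), NULL, digest, NULL);
  EVP_CIPHER_CTX_set_padding (ctx, 0);
  EVP_EncryptUpdate (ctx, ct, &outl, block, 16);
  EVP_CIPHER_CTX_free (ctx);

  // Compare against the first 16 ciphertext bytes of the stored hash.
  for (int i = 0; i < 16; i++) printf ("%02x", ct[i]);
  printf ("\n");

  return 0;
}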