merge master

pull/2884/head
Gabriele Gristina 3 years ago
commit 75d3d5503d

@ -1483,7 +1483,7 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
return r;
}
#if HAS_VPERM
#if HAS_VPERM == 1
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const int c)
{
u32x r = 0;
@ -1533,7 +1533,7 @@ DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const int c)
}
#endif
#if HAS_VADD3
#if HAS_VADD3 == 1
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
/*
@ -1711,6 +1711,15 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
return r;
}
DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
{
const int c_mod_4 = c & 3;
const u32 r = hc_byte_perm_S (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
return r;
}
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
{
const int c_mod_4 = c & 3;
@ -2056,6 +2065,152 @@ DECLSPEC int hc_enc_has_next (hc_enc_t *hc_enc, const int sz)
return 0;
}
DECLSPEC int hc_enc_validate_utf8 (const u32 *src_buf, const int src_pos, const int extraBytesToRead)
{
const u8 *src_ptr = (const u8 *) src_buf;
if (extraBytesToRead == 0)
{
const u8 c0 = src_ptr[src_pos + 0]; if (c0 >= 0x80) return 0;
}
else if (extraBytesToRead == 1)
{
const u8 c0 = src_ptr[src_pos + 0]; if ((c0 < 0xc2) || (c0 > 0xdf)) return 0;
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
}
else if (extraBytesToRead == 2)
{
const u8 c0 = src_ptr[src_pos + 0];
if ((c0 >= 0xe0) && (c0 <= 0xe0))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0xa0) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else if ((c0 >= 0xe1) && (c0 <= 0xec))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else if ((c0 >= 0xed) && (c0 <= 0xed))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0x9f)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else if ((c0 >= 0xee) && (c0 <= 0xef))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else
{
return 0;
}
}
else if (extraBytesToRead == 3)
{
const u8 c0 = src_ptr[src_pos + 0];
if ((c0 >= 0xf0) && (c0 <= 0xf0))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x90) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
const u8 c3 = src_ptr[src_pos + 3]; if ((c3 < 0x80) || (c3 > 0xbf)) return 0;
}
else if ((c0 >= 0xf1) && (c0 <= 0xf3))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
const u8 c3 = src_ptr[src_pos + 3]; if ((c3 < 0x80) || (c3 > 0xbf)) return 0;
}
else if ((c0 >= 0xf4) && (c0 <= 0xf4))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
const u8 c3 = src_ptr[src_pos + 3]; if ((c3 < 0x80) || (c3 > 0xbf)) return 0;
}
else
{
return 0;
}
}
return 1;
}
DECLSPEC int hc_enc_validate_utf8_global (GLOBAL_AS const u32 *src_buf, const int src_pos, const int extraBytesToRead)
{
GLOBAL_AS const u8 *src_ptr = (GLOBAL_AS const u8 *) src_buf;
if (extraBytesToRead == 0)
{
const u8 c0 = src_ptr[src_pos + 0]; if (c0 >= 0x80) return 0;
}
else if (extraBytesToRead == 1)
{
const u8 c0 = src_ptr[src_pos + 0]; if ((c0 < 0xc2) || (c0 > 0xdf)) return 0;
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
}
else if (extraBytesToRead == 2)
{
const u8 c0 = src_ptr[src_pos + 0];
if ((c0 >= 0xe0) && (c0 <= 0xe0))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0xa0) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else if ((c0 >= 0xe1) && (c0 <= 0xec))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else if ((c0 >= 0xed) && (c0 <= 0xed))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0x9f)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else if ((c0 >= 0xee) && (c0 <= 0xef))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
}
else
{
return 0;
}
}
else if (extraBytesToRead == 3)
{
const u8 c0 = src_ptr[src_pos + 0];
if ((c0 >= 0xf0) && (c0 <= 0xf0))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x90) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
const u8 c3 = src_ptr[src_pos + 3]; if ((c3 < 0x80) || (c3 > 0xbf)) return 0;
}
else if ((c0 >= 0xf1) && (c0 <= 0xf3))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
const u8 c3 = src_ptr[src_pos + 3]; if ((c3 < 0x80) || (c3 > 0xbf)) return 0;
}
else if ((c0 >= 0xf4) && (c0 <= 0xf4))
{
const u8 c1 = src_ptr[src_pos + 1]; if ((c1 < 0x80) || (c1 > 0xbf)) return 0;
const u8 c2 = src_ptr[src_pos + 2]; if ((c2 < 0x80) || (c2 > 0xbf)) return 0;
const u8 c3 = src_ptr[src_pos + 3]; if ((c3 < 0x80) || (c3 > 0xbf)) return 0;
}
else
{
return 0;
}
}
return 1;
}
// Input buffer and Output buffer size has to be multiple of 4 and at least of size 4.
// The output buffer is not zero padded, so entire buffer has to be set all zero before entering this function or truncated afterwards.
@ -2066,6 +2221,12 @@ DECLSPEC int hc_enc_next (hc_enc_t *hc_enc, const u32 *src_buf, const int src_le
int src_pos = hc_enc->pos;
#if VENDOR_ID == 8
// Work around segmentation fault in Intel JiT
// Tested with 2021.12.6.0.19_160000
volatile
#endif
int dst_pos = hc_enc->clen;
dst_buf[0] = hc_enc->cbuf;
@ -2109,6 +2270,15 @@ DECLSPEC int hc_enc_next (hc_enc_t *hc_enc, const u32 *src_buf, const int src_le
return dst_pos;
}
if (hc_enc_validate_utf8 (src_buf, src_pos, extraBytesToRead) == 0)
{
// broken input
hc_enc->pos = src_len;
return dst_pos;
}
u32 ch = 0;
switch (extraBytesToRead)
@ -2197,6 +2367,12 @@ DECLSPEC int hc_enc_next_global (hc_enc_t *hc_enc, GLOBAL_AS const u32 *src_buf,
int src_pos = hc_enc->pos;
#if VENDOR_ID == 8
// Work around segmentation fault in Intel JiT
// Tested with 2021.12.6.0.19_160000
volatile
#endif
int dst_pos = hc_enc->clen;
dst_buf[0] = hc_enc->cbuf;
@ -2240,6 +2416,15 @@ DECLSPEC int hc_enc_next_global (hc_enc_t *hc_enc, GLOBAL_AS const u32 *src_buf,
return dst_pos;
}
if (hc_enc_validate_utf8_global (src_buf, src_pos, extraBytesToRead) == 0)
{
// broken input
hc_enc->pos = src_len;
return dst_pos;
}
u32 ch = 0;
switch (extraBytesToRead)
@ -2300,6 +2485,8 @@ DECLSPEC int hc_enc_next_global (hc_enc_t *hc_enc, GLOBAL_AS const u32 *src_buf,
if ((dst_pos + 2) == dst_sz)
{
// this section seems to break intel opencl runtime but is unknown why
dst_ptr[dst_pos++] = (a >> 0) & 0xff;
dst_ptr[dst_pos++] = (a >> 8) & 0xff;
@ -2767,7 +2954,7 @@ DECLSPEC void make_utf16be (const u32x *in, u32x *out1, u32x *out2)
out1[1] = hc_byte_perm (in[0], 0, 0x3727);
out1[0] = hc_byte_perm (in[0], 0, 0x1707);
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM == 1
out2[3] = hc_byte_perm (in[3], 0, 0x03070207);
out2[2] = hc_byte_perm (in[3], 0, 0x01070007);
@ -2805,7 +2992,7 @@ DECLSPEC void make_utf16beN (const u32x *in, u32x *out1, u32x *out2)
out1[1] = hc_byte_perm (in[0], 0, 0x1707);
out1[0] = hc_byte_perm (in[0], 0, 0x3727);
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM == 1
out2[3] = hc_byte_perm (in[3], 0, 0x01070007);
out2[2] = hc_byte_perm (in[3], 0, 0x03070207);
@ -2843,7 +3030,7 @@ DECLSPEC void make_utf16le (const u32x *in, u32x *out1, u32x *out2)
out1[1] = hc_byte_perm (in[0], 0, 0x7372);
out1[0] = hc_byte_perm (in[0], 0, 0x7170);
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM == 1
out2[3] = hc_byte_perm (in[3], 0, 0x07030702);
out2[2] = hc_byte_perm (in[3], 0, 0x07010700);
@ -2881,7 +3068,7 @@ DECLSPEC void make_utf16leN (const u32x *in, u32x *out1, u32x *out2)
out1[1] = hc_byte_perm (in[0], 0, 0x7170);
out1[0] = hc_byte_perm (in[0], 0, 0x7372);
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM == 1
out2[3] = hc_byte_perm (in[3], 0, 0x07010700);
out2[2] = hc_byte_perm (in[3], 0, 0x07030702);
@ -2915,7 +3102,7 @@ DECLSPEC void undo_utf16be (const u32x *in1, const u32x *in2, u32x *out)
out[2] = hc_byte_perm (in2[0], in2[1], 0x4602);
out[3] = hc_byte_perm (in2[2], in2[3], 0x4602);
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM == 1
out[0] = hc_byte_perm (in1[0], in1[1], 0x04060002);
out[1] = hc_byte_perm (in1[2], in1[3], 0x04060002);
@ -2945,7 +3132,7 @@ DECLSPEC void undo_utf16le (const u32x *in1, const u32x *in2, u32x *out)
out[2] = hc_byte_perm (in2[0], in2[1], 0x6420);
out[3] = hc_byte_perm (in2[2], in2[3], 0x6420);
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM == 1
out[0] = hc_byte_perm (in1[0], in1[1], 0x06040200);
out[1] = hc_byte_perm (in1[2], in1[3], 0x06040200);
@ -36454,6 +36641,20 @@ DECLSPEC void append_0x01_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 o
append_helper_1x4_S (w3, ((offset16 == 3) ? 0x01010101 : 0), v);
}
DECLSPEC void append_0x2d_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset)
{
u32 v[4];
set_mark_1x4_S (v, offset);
const u32 offset16 = offset / 16;
append_helper_1x4_S (w0, ((offset16 == 0) ? 0x2d2d2d2d : 0), v);
append_helper_1x4_S (w1, ((offset16 == 1) ? 0x2d2d2d2d : 0), v);
append_helper_1x4_S (w2, ((offset16 == 2) ? 0x2d2d2d2d : 0), v);
append_helper_1x4_S (w3, ((offset16 == 3) ? 0x2d2d2d2d : 0), v);
}
DECLSPEC void append_0x80_1x4_S (u32 *w0, const u32 offset)
{
u32 v[4];
@ -36533,7 +36734,7 @@ DECLSPEC void make_utf16be_S (const u32 *in, u32 *out1, u32 *out2)
out1[1] = hc_byte_perm_S (in[0], 0, 0x3727);
out1[0] = hc_byte_perm_S (in[0], 0, 0x1707);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
out2[3] = hc_byte_perm_S (in[3], 0, 0x03070207);
out2[2] = hc_byte_perm_S (in[3], 0, 0x01070007);
@ -36571,7 +36772,7 @@ DECLSPEC void make_utf16le_S (const u32 *in, u32 *out1, u32 *out2)
out1[1] = hc_byte_perm_S (in[0], 0, 0x7372);
out1[0] = hc_byte_perm_S (in[0], 0, 0x7170);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
out2[3] = hc_byte_perm_S (in[3], 0, 0x07030702);
out2[2] = hc_byte_perm_S (in[3], 0, 0x07010700);
@ -36605,7 +36806,7 @@ DECLSPEC void undo_utf16be_S (const u32 *in1, const u32 *in2, u32 *out)
out[2] = hc_byte_perm_S (in2[0], in2[1], 0x4602);
out[3] = hc_byte_perm_S (in2[2], in2[3], 0x4602);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
out[0] = hc_byte_perm_S (in1[0], in1[1], 0x04060002);
out[1] = hc_byte_perm_S (in1[2], in1[3], 0x04060002);
@ -36635,7 +36836,7 @@ DECLSPEC void undo_utf16le_S (const u32 *in1, const u32 *in2, u32 *out)
out[2] = hc_byte_perm_S (in2[0], in2[1], 0x6420);
out[3] = hc_byte_perm_S (in2[2], in2[3], 0x6420);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
out[0] = hc_byte_perm_S (in1[0], in1[1], 0x06040200);
out[1] = hc_byte_perm_S (in1[2], in1[3], 0x06040200);
@ -68749,3 +68950,63 @@ DECLSPEC void append_0x80_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const
#endif
}
DECLSPEC void append_0x2d_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32x offset)
{
#if VECT_SIZE == 1
append_0x2d_4x4_S (w0, w1, w2, w3, offset);
#else
u32 t0[4];
u32 t1[4];
u32 t2[4];
u32 t3[4];
#endif
#if VECT_SIZE == 2
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
#elif VECT_SIZE == 4
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
#elif VECT_SIZE == 8
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
#elif VECT_SIZE == 16
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x2d_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x2d_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x2d_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x2d_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x2d_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x2d_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x2d_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
#endif
}

@ -290,6 +290,7 @@ DECLSPEC void append_helper_1x4_S (u32 *r, const u32 v, const u32 *m);
DECLSPEC void append_0x01_2x4_S (u32 *w0, u32 *w1, const u32 offset);
DECLSPEC void append_0x06_2x4_S (u32 *w0, u32 *w1, const u32 offset);
DECLSPEC void append_0x01_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset);
DECLSPEC void append_0x2d_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset);
DECLSPEC void append_0x80_1x4_S (u32 *w0, const u32 offset);
DECLSPEC void append_0x80_2x4_S (u32 *w0, u32 *w1, const u32 offset);
DECLSPEC void append_0x80_3x4_S (u32 *w0, u32 *w1, u32 *w2, const u32 offset);
@ -316,5 +317,6 @@ DECLSPEC void append_0x01_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const
DECLSPEC void append_0x06_2x4_VV (u32x *w0, u32x *w1, const u32x offset);
DECLSPEC void append_0x80_2x4_VV (u32x *w0, u32x *w1, const u32x offset);
DECLSPEC void append_0x80_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32x offset);
DECLSPEC void append_0x2d_4x4_VV (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32x offset);
#endif

@ -124,7 +124,9 @@ DECLSPEC u32 sub (u32 *r, const u32 *a, const u32 *b)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
// HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm
//#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
#elif 0
__asm__ __volatile__
(
"V_SUB_U32 %0, %9, %17;"
@ -176,7 +178,9 @@ DECLSPEC u32 add (u32 *r, const u32 *a, const u32 *b)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif (defined IS_AMD || defined IS_HIP) && HAS_VADD == 1 && HAS_VADDC == 1
// HIP doesnt support these so we stick to OpenCL (aka IS_AMD) - is also faster without asm
//#elif (defined IS_AMD || defined IS_HIP) && HAS_VSUB == 1 && HAS_VSUBB == 1
#elif 0
__asm__ __volatile__
(
"V_ADD_U32 %0, %9, %17;"

@ -24,7 +24,7 @@ DECLSPEC u64 blake2b_rot16_S (const u64 a)
return out.v64;
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
vconv64_t in;
@ -98,7 +98,7 @@ DECLSPEC u64 blake2b_rot24_S (const u64 a)
return out.v64;
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
#elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
vconv64_t in;
@ -113,7 +113,7 @@ DECLSPEC u64 blake2b_rot24_S (const u64 a)
#else
return hc_rotr64_S (a, 16);
return hc_rotr64_S (a, 24);
#endif
}

@ -1502,14 +1502,14 @@ DECLSPEC void sha384_hmac_init_128 (sha384_hmac_ctx_t *ctx, const u32 *w0, const
// opad
u32x b0[4];
u32x b1[4];
u32x b2[4];
u32x b3[4];
u32x b4[4];
u32x b5[4];
u32x b6[4];
u32x b7[4];
u32 b0[4];
u32 b1[4];
u32 b2[4];
u32 b3[4];
u32 b4[4];
u32 b5[4];
u32 b6[4];
u32 b7[4];
b0[0] = w0[0] ^ 0x5c5c5c5c;
b0[1] = w0[1] ^ 0x5c5c5c5c;
@ -3009,14 +3009,14 @@ DECLSPEC void sha384_hmac_init_vector_128 (sha384_hmac_ctx_vector_t *ctx, const
// opad
u32 b0[4];
u32 b1[4];
u32 b2[4];
u32 b3[4];
u32 b4[4];
u32 b5[4];
u32 b6[4];
u32 b7[4];
u32x b0[4];
u32x b1[4];
u32x b2[4];
u32x b3[4];
u32x b4[4];
u32x b5[4];
u32x b6[4];
u32x b7[4];
b0[0] = w0[0] ^ 0x5c5c5c5c;
b0[1] = w0[1] ^ 0x5c5c5c5c;

@ -48,9 +48,9 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
DECLSPEC u32 hc_atomic_or (volatile GLOBAL_AS u32 *p, volatile const u32 val);
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)));
DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused)));
DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused)));
DECLSPEC size_t get_global_id (const u32 dimindx);
DECLSPEC size_t get_local_id (const u32 dimindx);
DECLSPEC size_t get_local_size (const u32 dimindx);
DECLSPEC u32x rotl32 (const u32x a, const int n);
DECLSPEC u32x rotr32 (const u32x a, const int n);

@ -269,6 +269,35 @@ DECLSPEC int mangle_toggle_at (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p
return (len);
}
DECLSPEC int mangle_toggle_at_sep (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len)
{
if (len >= RP_PASSWORD_SIZE) return (len);
u8 occurence = 0;
u32 rem = 0;
for (int i = 0, idx = 0; i < len; i += 4, idx += 1)
{
const u32 t = buf[idx];
buf[idx] = t | generate_cmask (t);
u32 out = rem;
rem = 0;
if (((t >> 0) & 0xff) == p1) { if (occurence == p0) out = 0x0000ff00; occurence++; }
if (((t >> 8) & 0xff) == p1) { if (occurence == p0) out = 0x00ff0000; occurence++; }
if (((t >> 16) & 0xff) == p1) { if (occurence == p0) out = 0xff000000; occurence++; }
if (((t >> 24) & 0xff) == p1) { if (occurence == p0) rem = 0x000000ff; occurence++; }
buf[idx] = t ^ (generate_cmask (t) & out);
}
return (len);
}
DECLSPEC int mangle_reverse (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len)
{
for (int l = 0; l < len / 2; l++)
@ -725,6 +754,7 @@ DECLSPEC int apply_rule (const u32 name, MAYBE_UNUSED const u8 p0, MAYBE_UNUSED
case RULE_OP_MANGLE_UREST_LFIRST: out_len = mangle_urest_lfirst (p0, p1, buf, out_len); break;
case RULE_OP_MANGLE_TREST: out_len = mangle_trest (p0, p1, buf, out_len); break;
case RULE_OP_MANGLE_TOGGLE_AT: out_len = mangle_toggle_at (p0, p1, buf, out_len); break;
case RULE_OP_MANGLE_TOGGLE_AT_SEP: out_len = mangle_toggle_at_sep (p0, p1, buf, out_len); break;
case RULE_OP_MANGLE_REVERSE: out_len = mangle_reverse (p0, p1, buf, out_len); break;
case RULE_OP_MANGLE_DUPEWORD: out_len = mangle_dupeword (p0, p1, buf, out_len); break;
case RULE_OP_MANGLE_DUPEWORD_TIMES: out_len = mangle_dupeword_times (p0, p1, (u8 *) buf, out_len); break;

@ -21,6 +21,7 @@
#define RULE_OP_MANGLE_UREST_LFIRST 'C'
#define RULE_OP_MANGLE_TREST 't'
#define RULE_OP_MANGLE_TOGGLE_AT 'T'
#define RULE_OP_MANGLE_TOGGLE_AT_SEP '3'
#define RULE_OP_MANGLE_REVERSE 'r'
#define RULE_OP_MANGLE_DUPEWORD 'd'
#define RULE_OP_MANGLE_DUPEWORD_TIMES 'p'
@ -83,6 +84,7 @@ DECLSPEC int mangle_urest (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u
DECLSPEC int mangle_urest_lfirst (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len);
DECLSPEC int mangle_trest (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len);
DECLSPEC int mangle_toggle_at (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len);
DECLSPEC int mangle_toggle_at_sep (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len);
DECLSPEC int mangle_reverse (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len);
DECLSPEC int mangle_dupeword (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len);
DECLSPEC int mangle_dupeword_times (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u8 *buf, const int len);

@ -1163,6 +1163,82 @@ DECLSPEC u32 rule_op_mangle_toggle_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED c
return (in_len);
}
DECLSPEC u32 rule_op_mangle_toggle_at_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
{
if (in_len == 0) return in_len;
u32 r0 = search_on_register (buf0[0], p1);
u32 r1 = search_on_register (buf0[1], p1);
u32 r2 = search_on_register (buf0[2], p1);
u32 r3 = search_on_register (buf0[3], p1);
u32 r4 = search_on_register (buf1[0], p1);
u32 r5 = search_on_register (buf1[1], p1);
u32 r6 = search_on_register (buf1[2], p1);
u32 r7 = search_on_register (buf1[3], p1);
const u32 rn = (r0 << 0)
| (r1 << 4)
| (r2 << 8)
| (r3 << 12)
| (r4 << 16)
| (r5 << 20)
| (r6 << 24)
| (r7 << 28);
if (rn == 0) return in_len;
u32 occurence = 0;
u32 ro = 0;
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 32; i++)
{
if ((rn >> i) & 1)
{
if (occurence == p0)
{
ro = 1 << i;
break;
}
occurence++;
}
}
r0 = (ro >> 0) & 15;
r1 = (ro >> 4) & 15;
r2 = (ro >> 8) & 15;
r3 = (ro >> 12) & 15;
r4 = (ro >> 16) & 15;
r5 = (ro >> 20) & 15;
r6 = (ro >> 24) & 15;
r7 = (ro >> 28) & 15;
r0 <<= 1;
r1 <<= 1; r1 |= r0 >> 4;
r2 <<= 1; r2 |= r1 >> 4;
r3 <<= 1; r3 |= r2 >> 4;
r4 <<= 1; r4 |= r3 >> 4;
r5 <<= 1; r5 |= r4 >> 4;
r6 <<= 1; r6 |= r5 >> 4;
r7 <<= 1; r7 |= r6 >> 4;
buf0[0] = toggle_on_register (buf0[0], r0);
buf0[1] = toggle_on_register (buf0[1], r1);
buf0[2] = toggle_on_register (buf0[2], r2);
buf0[3] = toggle_on_register (buf0[3], r3);
buf1[0] = toggle_on_register (buf1[0], r4);
buf1[1] = toggle_on_register (buf1[1], r5);
buf1[2] = toggle_on_register (buf1[2], r6);
buf1[3] = toggle_on_register (buf1[3], r7);
return in_len;
}
DECLSPEC u32 rule_op_mangle_reverse (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
{
reverse_block_optimized (buf0, buf1, buf0, buf1, in_len);
@ -2285,6 +2361,7 @@ DECLSPEC u32 apply_rule_optimized (const u32 name, const u32 p0, const u32 p1, u
case RULE_OP_MANGLE_UREST_LFIRST: out_len = rule_op_mangle_urest_lfirst (p0, p1, buf0, buf1, out_len); break;
case RULE_OP_MANGLE_TREST: out_len = rule_op_mangle_trest (p0, p1, buf0, buf1, out_len); break;
case RULE_OP_MANGLE_TOGGLE_AT: out_len = rule_op_mangle_toggle_at (p0, p1, buf0, buf1, out_len); break;
case RULE_OP_MANGLE_TOGGLE_AT_SEP: out_len = rule_op_mangle_toggle_at_sep (p0, p1, buf0, buf1, out_len); break;
case RULE_OP_MANGLE_REVERSE: out_len = rule_op_mangle_reverse (p0, p1, buf0, buf1, out_len); break;
case RULE_OP_MANGLE_DUPEWORD: out_len = rule_op_mangle_dupeword (p0, p1, buf0, buf1, out_len); break;
case RULE_OP_MANGLE_DUPEWORD_TIMES: out_len = rule_op_mangle_dupeword_times (p0, p1, buf0, buf1, out_len); break;

@ -21,6 +21,7 @@
#define RULE_OP_MANGLE_UREST_LFIRST 'C'
#define RULE_OP_MANGLE_TREST 't'
#define RULE_OP_MANGLE_TOGGLE_AT 'T'
#define RULE_OP_MANGLE_TOGGLE_AT_SEP '3'
#define RULE_OP_MANGLE_REVERSE 'r'
#define RULE_OP_MANGLE_DUPEWORD 'd'
#define RULE_OP_MANGLE_DUPEWORD_TIMES 'p'
@ -85,6 +86,7 @@ DECLSPEC u32 rule_op_mangle_lrest_ufirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSE
DECLSPEC u32 rule_op_mangle_urest_lfirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);
DECLSPEC u32 rule_op_mangle_trest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);
DECLSPEC u32 rule_op_mangle_toggle_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);
DECLSPEC u32 rule_op_mangle_toggle_at_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);
DECLSPEC u32 rule_op_mangle_reverse (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);
DECLSPEC u32 rule_op_mangle_dupeword (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);
DECLSPEC u32 rule_op_mangle_dupeword_times (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len);

@ -1049,6 +1049,18 @@
#define MATCHES_NONE_VV(a,b) !(MATCHES_ONE_VV ((a), (b)))
#define MATCHES_NONE_VS(a,b) !(MATCHES_ONE_VS ((a), (b)))
#if VECT_SIZE == 1
#define pack(arr,var,gid,idx) make_u32x ((arr)[((gid) * 1) + 0].var)
#elif VECT_SIZE == 2
#define pack(arr,var,gid,idx) make_u32x ((arr)[((gid) * 2) + 0].var, (arr)[((gid) * 2) + 1].var)
#elif VECT_SIZE == 4
#define pack(arr,var,gid,idx) make_u32x ((arr)[((gid) * 4) + 0].var, (arr)[((gid) * 4) + 1].var, (arr)[((gid) * 4) + 2].var, (arr)[((gid) * 4) + 3].var)
#elif VECT_SIZE == 8
#define pack(arr,var,gid,idx) make_u32x ((arr)[((gid) * 8) + 0].var, (arr)[((gid) * 8) + 1].var, (arr)[((gid) * 8) + 2].var, (arr)[((gid) * 8) + 3].var, (arr)[((gid) * 8) + 4].var, (arr)[((gid) * 8) + 5].var, (arr)[((gid) * 8) + 6].var, (arr)[((gid) * 8) + 7].var)
#elif VECT_SIZE == 16
#define pack(arr,var,gid,idx) make_u32x ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var)
#endif
#if VECT_SIZE == 1
#define packv(arr,var,gid,idx) make_u32x ((arr)[((gid) * 1) + 0].var[(idx)])
#elif VECT_SIZE == 2
@ -1097,6 +1109,18 @@
#define pack64vf(arr,var,gid) make_u64x ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var)
#endif
#if VECT_SIZE == 1
#define unpack(arr,var,gid,val) (arr)[((gid) * 1) + 0].var = val;
#elif VECT_SIZE == 2
#define unpack(arr,var,gid,val) (arr)[((gid) * 2) + 0].var = val.s0; (arr)[((gid) * 2) + 1].var = val.s1;
#elif VECT_SIZE == 4
#define unpack(arr,var,gid,val) (arr)[((gid) * 4) + 0].var = val.s0; (arr)[((gid) * 4) + 1].var = val.s1; (arr)[((gid) * 4) + 2].var = val.s2; (arr)[((gid) * 4) + 3].var = val.s3;
#elif VECT_SIZE == 8
#define unpack(arr,var,gid,val) (arr)[((gid) * 8) + 0].var = val.s0; (arr)[((gid) * 8) + 1].var = val.s1; (arr)[((gid) * 8) + 2].var = val.s2; (arr)[((gid) * 8) + 3].var = val.s3; (arr)[((gid) * 8) + 4].var = val.s4; (arr)[((gid) * 8) + 5].var = val.s5; (arr)[((gid) * 8) + 6].var = val.s6; (arr)[((gid) * 8) + 7].var = val.s7;
#elif VECT_SIZE == 16
#define unpack(arr,var,gid,val) (arr)[((gid) * 16) + 0].var = val.s0; (arr)[((gid) * 16) + 1].var = val.s1; (arr)[((gid) * 16) + 2].var = val.s2; (arr)[((gid) * 16) + 3].var = val.s3; (arr)[((gid) * 16) + 4].var = val.s4; (arr)[((gid) * 16) + 5].var = val.s5; (arr)[((gid) * 16) + 6].var = val.s6; (arr)[((gid) * 16) + 7].var = val.s7; (arr)[((gid) * 16) + 8].var = val.s8; (arr)[((gid) * 16) + 9].var = val.s9; (arr)[((gid) * 16) + 10].var = val.sa; (arr)[((gid) * 16) + 11].var = val.sb; (arr)[((gid) * 16) + 12].var = val.sc; (arr)[((gid) * 16) + 13].var = val.sd; (arr)[((gid) * 16) + 14].var = val.se; (arr)[((gid) * 16) + 15].var = val.sf;
#endif
#if VECT_SIZE == 1
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 1) + 0].var[(idx)] = val;
#elif VECT_SIZE == 2

@ -17,18 +17,27 @@
#endif
#ifdef IS_CUDA
//https://docs.nvidia.com/cuda/nvrtc/index.html#integer-size
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long long ulong;
// https://docs.nvidia.com/cuda/nvrtc/index.html#integer-size
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef unsigned long long ullong;
#endif
#ifdef IS_OPENCL
typedef ulong ullong;
typedef ulong2 ullong2;
typedef ulong4 ullong4;
typedef ulong8 ullong8;
typedef ulong16 ullong16;
#endif
#ifdef KERNEL_STATIC
typedef uchar u8;
typedef ushort u16;
typedef uint u32;
typedef ulong u64;
typedef ullong u64;
#else
typedef uint8_t u8;
typedef uint16_t u16;
@ -68,7 +77,7 @@ typedef u64 u64x;
#define make_u64x (u64)
#else
#if defined IS_CUDA
#if defined IS_CUDA || defined IS_HIP
#if VECT_SIZE == 2
@ -824,767 +833,11 @@ typedef __device_builtin__ struct u64x u64x;
#define make_u32x u32x
#define make_u64x u64x
#elif defined IS_HIP
#if VECT_SIZE == 2
struct u8x
{
u8 s0;
u8 s1;
inline __device__ u8x (const u8 a, const u8 b) : s0(a), s1(b) { }
inline __device__ u8x (const u8 a) : s0(a), s1(a) { }
inline __device__ u8x (void) : s0(0), s1(0) { }
inline __device__ ~u8x (void) { }
};
struct u16x
{
u16 s0;
u16 s1;
inline __device__ u16x (const u16 a, const u16 b) : s0(a), s1(b) { }
inline __device__ u16x (const u16 a) : s0(a), s1(a) { }
inline __device__ u16x (void) : s0(0), s1(0) { }
inline __device__ ~u16x (void) { }
};
struct u32x
{
u32 s0;
u32 s1;
inline __device__ u32x (const u32 a, const u32 b) : s0(a), s1(b) { }
inline __device__ u32x (const u32 a) : s0(a), s1(a) { }
inline __device__ u32x (void) : s0(0), s1(0) { }
inline __device__ ~u32x (void) { }
};
struct u64x
{
u64 s0;
u64 s1;
inline __device__ u64x (const u64 a, const u64 b) : s0(a), s1(b) { }
inline __device__ u64x (const u64 a) : s0(a), s1(a) { }
inline __device__ u64x (void) : s0(0), s1(0) { }
inline __device__ ~u64x (void) { }
};
inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b)); }
inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1)); }
inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; }
inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; }
inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; }
inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; }
inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; }
inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; }
inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; }
inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; }
inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; }
inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; }
inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; }
inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; }
inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; }
inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; }
inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; }
inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; }
inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) ); }
inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1)); }
inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) ); }
inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1)); }
inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) ); }
inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1)); }
inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) ); }
inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1)); }
inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) ); }
inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1)); }
inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) ); }
inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1)); }
inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) ); }
inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1)); }
inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) ); }
inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1)); }
inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) ); }
inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1)); }
inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1); }
inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b)); }
inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1)); }
inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; }
inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; }
inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; }
inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; }
inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; }
inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; }
inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; }
inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; }
inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; }
inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; }
inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; }
inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; }
inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; }
inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; }
inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; }
inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; }
inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) ); }
inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1)); }
inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) ); }
inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1)); }
inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) ); }
inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1)); }
inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) ); }
inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1)); }
inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) ); }
inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1)); }
inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) ); }
inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1)); }
inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) ); }
inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1)); }
inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) ); }
inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1)); }
inline __device__ u64x operator % (const u64x a, const u64 b) { return u64x ((a.s0 % b), (a.s1 % b) ); }
inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1)); }
inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1); }
#endif
#if VECT_SIZE == 4
struct u8x
{
u8 s0;
u8 s1;
u8 s2;
u8 s3;
inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d) : s0(a), s1(b), s2(c), s3(d) { }
inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a) { }
inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0) { }
inline __device__ ~u8x (void) { }
};
struct u16x
{
u16 s0;
u16 s1;
u16 s2;
u16 s3;
inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d) : s0(a), s1(b), s2(c), s3(d) { }
inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a) { }
inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0) { }
inline __device__ ~u16x (void) { }
};
struct u32x
{
u32 s0;
u32 s1;
u32 s2;
u32 s3;
inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d) : s0(a), s1(b), s2(c), s3(d) { }
inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a) { }
inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0) { }
inline __device__ ~u32x (void) { }
};
struct u64x
{
u64 s0;
u64 s1;
u64 s2;
u64 s3;
inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d) : s0(a), s1(b), s2(c), s3(d) { }
inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a) { }
inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0) { }
inline __device__ ~u64x (void) { }
};
inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) ); }
inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); }
inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; }
inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; }
inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; }
inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; }
inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; }
inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; }
inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; }
inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; }
inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; }
inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; }
inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; }
inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; }
inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; }
inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; }
inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; }
inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; }
inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) ); }
inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3)); }
inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) ); }
inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3)); }
inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) ); }
inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3)); }
inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) ); }
inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3)); }
inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) ); }
inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3)); }
inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) ); }
inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3)); }
inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) ); }
inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3)); }
inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) ); }
inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3)); }
inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) ); }
inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3)); }
inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3); }
inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) ); }
inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3)); }
inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; }
inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; }
inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; }
inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; }
inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; }
inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; }
inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; }
inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; }
inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; }
inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; }
inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; }
inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; }
inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; }
inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; }
inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; }
inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; }
inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) ); }
inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3)); }
inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) ); }
inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3)); }
inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) ); }
inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3)); }
inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) ); }
inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3)); }
inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) ); }
inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3)); }
inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) ); }
inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3)); }
inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) ); }
inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3)); }
inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) ); }
inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3)); }
inline __device__ u64x operator % (const u64x a, const u32 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) ); }
inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3)); }
inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3); }
#endif
#if VECT_SIZE == 8
struct u8x
{
u8 s0;
u8 s1;
u8 s2;
u8 s3;
u8 s4;
u8 s5;
u8 s6;
u8 s7;
inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
inline __device__ ~u8x (void) { }
};
struct u16x
{
u16 s0;
u16 s1;
u16 s2;
u16 s3;
u16 s4;
u16 s5;
u16 s6;
u16 s7;
inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
inline __device__ ~u16x (void) { }
};
struct u32x
{
u32 s0;
u32 s1;
u32 s2;
u32 s3;
u32 s4;
u32 s5;
u32 s6;
u32 s7;
inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
inline __device__ ~u32x (void) { }
};
struct u64x
{
u64 s0;
u64 s1;
u64 s2;
u64 s3;
u64 s4;
u64 s5;
u64 s6;
u64 s7;
inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h) { }
inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a) { }
inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0) { }
inline __device__ ~u64x (void) { }
};
inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) ); }
inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); }
inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; }
inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; }
inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; }
inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; }
inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; }
inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; }
inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; }
inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; }
inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; }
inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; }
inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; }
inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; }
inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; }
inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; }
inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; }
inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; }
inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b) ); }
inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7)); }
inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b) ); }
inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7)); }
inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b) ); }
inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7)); }
inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b) ); }
inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7)); }
inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b) ); }
inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7)); }
inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b) ); }
inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7)); }
inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b) ); }
inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7)); }
inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b) ); }
inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7)); }
inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b) ); }
inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7)); }
inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); }
inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) ); }
inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7)); }
inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; }
inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; }
inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; }
inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; }
inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; }
inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; }
inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; }
inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; }
inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; }
inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; }
inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; }
inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; }
inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; }
inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; }
inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; }
inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; }
inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b) ); }
inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7)); }
inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b) ); }
inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7)); }
inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b) ); }
inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7)); }
inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b) ); }
inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7)); }
inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b) ); }
inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7)); }
inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b) ); }
inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7)); }
inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b) ); }
inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7)); }
inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b) ); }
inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7)); }
inline __device__ u64x operator % (const u64x a, const u64 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b) ); }
inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7)); }
inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7); }
#endif
#if VECT_SIZE == 16
struct u8x
{
u8 s0;
u8 s1;
u8 s2;
u8 s3;
u8 s4;
u8 s5;
u8 s6;
u8 s7;
u8 s8;
u8 s9;
u8 sa;
u8 sb;
u8 sc;
u8 sd;
u8 se;
u8 sf;
inline __device__ u8x (const u8 a, const u8 b, const u8 c, const u8 d, const u8 e, const u8 f, const u8 g, const u8 h, const u8 i, const u8 j, const u8 k, const u8 l, const u8 m, const u8 n, const u8 o, const u8 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
inline __device__ u8x (const u8 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
inline __device__ u8x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { }
inline __device__ ~u8x (void) { }
};
struct u16x
{
u16 s0;
u16 s1;
u16 s2;
u16 s3;
u16 s4;
u16 s5;
u16 s6;
u16 s7;
u16 s8;
u16 s9;
u16 sa;
u16 sb;
u16 sc;
u16 sd;
u16 se;
u16 sf;
inline __device__ u16x (const u16 a, const u16 b, const u16 c, const u16 d, const u16 e, const u16 f, const u16 g, const u16 h, const u16 i, const u16 j, const u16 k, const u16 l, const u16 m, const u16 n, const u16 o, const u16 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
inline __device__ u16x (const u16 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
inline __device__ u16x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ }
inline __device__ ~u16x (void) { }
};
struct u32x
{
u32 s0;
u32 s1;
u32 s2;
u32 s3;
u32 s4;
u32 s5;
u32 s6;
u32 s7;
u32 s8;
u32 s9;
u32 sa;
u32 sb;
u32 sc;
u32 sd;
u32 se;
u32 sf;
inline __device__ u32x (const u32 a, const u32 b, const u32 c, const u32 d, const u32 e, const u32 f, const u32 g, const u32 h, const u32 i, const u32 j, const u32 k, const u32 l, const u32 m, const u32 n, const u32 o, const u32 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
inline __device__ u32x (const u32 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
inline __device__ u32x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0){ }
inline __device__ ~u32x (void) { }
};
struct u64x
{
u64 s0;
u64 s1;
u64 s2;
u64 s3;
u64 s4;
u64 s5;
u64 s6;
u64 s7;
u64 s8;
u64 s9;
u64 sa;
u64 sb;
u64 sc;
u64 sd;
u64 se;
u64 sf;
inline __device__ u64x (const u64 a, const u64 b, const u64 c, const u64 d, const u64 e, const u64 f, const u64 g, const u64 h, const u64 i, const u64 j, const u64 k, const u64 l, const u64 m, const u64 n, const u64 o, const u64 p) : s0(a), s1(b), s2(c), s3(d), s4(e), s5(f), s6(g), s7(h), s8(i), s9(j), sa(k), sb(l), sc(m), sd(n), se(o), sf(p) { }
inline __device__ u64x (const u64 a) : s0(a), s1(a), s2(a), s3(a), s4(a), s5(a), s6(a), s7(a), s8(a), s9(a), sa(a), sb(a), sc(a), sd(a), se(a), sf(a) { }
inline __device__ u64x (void) : s0(0), s1(0), s2(0), s3(0), s4(0), s5(0), s6(0), s7(0), s8(0), s9(0), sa(0), sb(0), sc(0), sd(0), se(0), sf(0) { }
inline __device__ ~u64x (void) { }
};
inline __device__ bool operator != (const u32x a, const u32 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) && (a.s8 != b) && (a.s9 != b) && (a.sa != b) && (a.sb != b) && (a.sc != b) && (a.sd != b) && (a.se != b) && (a.sf != b) ); }
inline __device__ bool operator != (const u32x a, const u32x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); }
inline __device__ void operator ^= (u32x &a, const u32 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; a.s8 ^= b; a.s9 ^= b; a.sa ^= b; a.sb ^= b; a.sc ^= b; a.sd ^= b; a.se ^= b; a.sf ^= b; }
inline __device__ void operator ^= (u32x &a, const u32x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; }
inline __device__ void operator |= (u32x &a, const u32 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; a.s8 |= b; a.s9 |= b; a.sa |= b; a.sb |= b; a.sc |= b; a.sd |= b; a.se |= b; a.sf |= b; }
inline __device__ void operator |= (u32x &a, const u32x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; }
inline __device__ void operator &= (u32x &a, const u32 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; a.s8 &= b; a.s9 &= b; a.sa &= b; a.sb &= b; a.sc &= b; a.sd &= b; a.se &= b; a.sf &= b; }
inline __device__ void operator &= (u32x &a, const u32x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; }
inline __device__ void operator += (u32x &a, const u32 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; a.s8 += b; a.s9 += b; a.sa += b; a.sb += b; a.sc += b; a.sd += b; a.se += b; a.sf += b; }
inline __device__ void operator += (u32x &a, const u32x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa += b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; }
inline __device__ void operator -= (u32x &a, const u32 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; a.s8 -= b; a.s9 -= b; a.sa -= b; a.sb -= b; a.sc -= b; a.sd -= b; a.se -= b; a.sf -= b; }
inline __device__ void operator -= (u32x &a, const u32x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; }
inline __device__ void operator *= (u32x &a, const u32 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; a.s8 *= b; a.s9 *= b; a.sa *= b; a.sb *= b; a.sc *= b; a.sd *= b; a.se *= b; a.sf *= b; }
inline __device__ void operator *= (u32x &a, const u32x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; }
inline __device__ void operator >>= (u32x &a, const u32 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; a.s8 >>= b; a.s9 >>= b; a.sa >>= b; a.sb >>= b; a.sc >>= b; a.sd >>= b; a.se >>= b; a.sf >>= b; }
inline __device__ void operator >>= (u32x &a, const u32x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; }
inline __device__ void operator <<= (u32x &a, const u32 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; a.s8 <<= b; a.s9 <<= b; a.sa <<= b; a.sb <<= b; a.sc <<= b; a.sd <<= b; a.se <<= b; a.sf <<= b; }
inline __device__ void operator <<= (u32x &a, const u32x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; }
inline __device__ u32x operator << (const u32x a, const u32 b) { return u32x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b), (a.s8 << b), (a.s9 << b) , (a.sa << b), (a.sb << b) , (a.sc << b), (a.sd << b) , (a.se << b), (a.sf << b) ); }
inline __device__ u32x operator << (const u32x a, const u32x b) { return u32x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf)); }
inline __device__ u32x operator >> (const u32x a, const u32 b) { return u32x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b), (a.s8 >> b), (a.s9 >> b) , (a.sa >> b), (a.sb >> b) , (a.sc >> b), (a.sd >> b) , (a.se >> b), (a.sf >> b) ); }
inline __device__ u32x operator >> (const u32x a, const u32x b) { return u32x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf)); }
inline __device__ u32x operator ^ (const u32x a, const u32 b) { return u32x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b), (a.s8 ^ b), (a.s9 ^ b) , (a.sa ^ b), (a.sb ^ b) , (a.sc ^ b), (a.sd ^ b) , (a.se ^ b), (a.sf ^ b) ); }
inline __device__ u32x operator ^ (const u32x a, const u32x b) { return u32x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7), (a.s8 ^ b.s8), (a.s9 ^ b.s9), (a.sa ^ b.sa), (a.sb ^ b.sb), (a.sc ^ b.sc), (a.sd ^ b.sd), (a.se ^ b.se), (a.sf ^ b.sf)); }
inline __device__ u32x operator | (const u32x a, const u32 b) { return u32x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b), (a.s8 | b), (a.s9 | b) , (a.sa | b), (a.sb | b) , (a.sc | b), (a.sd | b) , (a.se | b), (a.sf | b) ); }
inline __device__ u32x operator | (const u32x a, const u32x b) { return u32x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7), (a.s8 | b.s8), (a.s9 | b.s9), (a.sa | b.sa), (a.sb | b.sb), (a.sc | b.sc), (a.sd | b.sd), (a.se | b.se), (a.sf | b.sf)); }
inline __device__ u32x operator & (const u32x a, const u32 b) { return u32x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b), (a.s8 & b), (a.s9 & b) , (a.sa & b), (a.sb & b) , (a.sc & b), (a.sd & b) , (a.se & b), (a.sf & b) ); }
inline __device__ u32x operator & (const u32x a, const u32x b) { return u32x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7), (a.s8 & b.s8), (a.s9 & b.s9), (a.sa & b.sa), (a.sb & b.sb), (a.sc & b.sc), (a.sd & b.sd), (a.se & b.se), (a.sf & b.sf)); }
inline __device__ u32x operator + (const u32x a, const u32 b) { return u32x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b), (a.s8 + b), (a.s9 + b) , (a.sa + b), (a.sb + b) , (a.sc + b), (a.sd + b) , (a.se + b), (a.sf + b) ); }
inline __device__ u32x operator + (const u32x a, const u32x b) { return u32x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7), (a.s8 + b.s8), (a.s9 + b.s9), (a.sa + b.sa), (a.sb + b.sb), (a.sc + b.sc), (a.sd + b.sd), (a.se + b.se), (a.sf + b.sf)); }
inline __device__ u32x operator - (const u32x a, const u32 b) { return u32x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b), (a.s8 - b), (a.s9 - b) , (a.sa - b), (a.sb - b) , (a.sc - b), (a.sd - b) , (a.se - b), (a.sf - b) ); }
inline __device__ u32x operator - (const u32x a, const u32x b) { return u32x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7), (a.s8 - b.s8), (a.s9 - b.s9), (a.sa - b.sa), (a.sb - b.sb), (a.sc - b.sc), (a.sd - b.sd), (a.se - b.se), (a.sf - b.sf)); }
inline __device__ u32x operator * (const u32x a, const u32 b) { return u32x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b), (a.s8 * b), (a.s9 * b) , (a.sa * b), (a.sb * b) , (a.sc * b), (a.sd * b) , (a.se * b), (a.sf * b) ); }
inline __device__ u32x operator * (const u32x a, const u32x b) { return u32x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7), (a.s8 * b.s8), (a.s9 * b.s9), (a.sa * b.sa), (a.sb * b.sb), (a.sc * b.sc), (a.sd * b.sd), (a.se * b.se), (a.sf * b.sf)); }
inline __device__ u32x operator % (const u32x a, const u32 b) { return u32x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b), (a.s8 % b), (a.s9 % b) , (a.sa % b), (a.sb % b) , (a.sc % b), (a.sd % b) , (a.se % b), (a.sf % b) ); }
inline __device__ u32x operator % (const u32x a, const u32x b) { return u32x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7), (a.s8 % b.s8), (a.s9 % b.s9), (a.sa % b.sa), (a.sb % b.sb), (a.sc % b.sc), (a.sd % b.sd), (a.se % b.se), (a.sf % b.sf)); }
inline __device__ u32x operator ~ (const u32x a) { return u32x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); }
inline __device__ bool operator != (const u64x a, const u64 b) { return ((a.s0 != b) && (a.s1 != b) && (a.s2 != b) && (a.s3 != b) && (a.s4 != b) && (a.s5 != b) && (a.s6 != b) && (a.s7 != b) && (a.s8 != b) && (a.s9 != b) && (a.sa != b) && (a.sb != b) && (a.sc != b) && (a.sd != b) && (a.se != b) && (a.sf != b) ); }
inline __device__ bool operator != (const u64x a, const u64x b) { return ((a.s0 != b.s0) && (a.s1 != b.s1) && (a.s2 != b.s2) && (a.s3 != b.s3) && (a.s4 != b.s4) && (a.s5 != b.s5) && (a.s6 != b.s6) && (a.s7 != b.s7) && (a.s8 != b.s8) && (a.s9 != b.s9) && (a.sa != b.sa) && (a.sb != b.sb) && (a.sc != b.sc) && (a.sd != b.sd) && (a.se != b.se) && (a.sf != b.sf)); }
inline __device__ void operator ^= (u64x &a, const u64 b) { a.s0 ^= b; a.s1 ^= b; a.s2 ^= b; a.s3 ^= b; a.s4 ^= b; a.s5 ^= b; a.s6 ^= b; a.s7 ^= b; a.s8 ^= b; a.s9 ^= b; a.sa ^= b; a.sb ^= b; a.sc ^= b; a.sd ^= b; a.se ^= b; a.sf ^= b; }
inline __device__ void operator ^= (u64x &a, const u64x b) { a.s0 ^= b.s0; a.s1 ^= b.s1; a.s2 ^= b.s2; a.s3 ^= b.s3; a.s4 ^= b.s4; a.s5 ^= b.s5; a.s6 ^= b.s6; a.s7 ^= b.s7; a.s8 ^= b.s8; a.s9 ^= b.s9; a.sa ^= b.sa; a.sb ^= b.sb; a.sc ^= b.sc; a.sd ^= b.sd; a.se ^= b.se; a.sf ^= b.sf; }
inline __device__ void operator |= (u64x &a, const u64 b) { a.s0 |= b; a.s1 |= b; a.s2 |= b; a.s3 |= b; a.s4 |= b; a.s5 |= b; a.s6 |= b; a.s7 |= b; a.s8 |= b; a.s9 |= b; a.sa |= b; a.sb |= b; a.sc |= b; a.sd |= b; a.se |= b; a.sf |= b; }
inline __device__ void operator |= (u64x &a, const u64x b) { a.s0 |= b.s0; a.s1 |= b.s1; a.s2 |= b.s2; a.s3 |= b.s3; a.s4 |= b.s4; a.s5 |= b.s5; a.s6 |= b.s6; a.s7 |= b.s7; a.s8 |= b.s8; a.s9 |= b.s9; a.sa |= b.sa; a.sb |= b.sb; a.sc |= b.sc; a.sd |= b.sd; a.se |= b.se; a.sf |= b.sf; }
inline __device__ void operator &= (u64x &a, const u64 b) { a.s0 &= b; a.s1 &= b; a.s2 &= b; a.s3 &= b; a.s4 &= b; a.s5 &= b; a.s6 &= b; a.s7 &= b; a.s8 &= b; a.s9 &= b; a.sa &= b; a.sb &= b; a.sc &= b; a.sd &= b; a.se &= b; a.sf &= b; }
inline __device__ void operator &= (u64x &a, const u64x b) { a.s0 &= b.s0; a.s1 &= b.s1; a.s2 &= b.s2; a.s3 &= b.s3; a.s4 &= b.s4; a.s5 &= b.s5; a.s6 &= b.s6; a.s7 &= b.s7; a.s8 &= b.s8; a.s9 &= b.s9; a.sa &= b.sa; a.sb &= b.sb; a.sc &= b.sc; a.sd &= b.sd; a.se &= b.se; a.sf &= b.sf; }
inline __device__ void operator += (u64x &a, const u64 b) { a.s0 += b; a.s1 += b; a.s2 += b; a.s3 += b; a.s4 += b; a.s5 += b; a.s6 += b; a.s7 += b; a.s8 += b; a.s9 += b; a.sa += b; a.sb += b; a.sc += b; a.sd += b; a.se += b; a.sf += b; }
inline __device__ void operator += (u64x &a, const u64x b) { a.s0 += b.s0; a.s1 += b.s1; a.s2 += b.s2; a.s3 += b.s3; a.s4 += b.s4; a.s5 += b.s5; a.s6 += b.s6; a.s7 += b.s7; a.s8 += b.s8; a.s9 += b.s9; a.sa += b.sa; a.sb += b.sb; a.sc += b.sc; a.sd += b.sd; a.se += b.se; a.sf += b.sf; }
inline __device__ void operator -= (u64x &a, const u64 b) { a.s0 -= b; a.s1 -= b; a.s2 -= b; a.s3 -= b; a.s4 -= b; a.s5 -= b; a.s6 -= b; a.s7 -= b; a.s8 -= b; a.s9 -= b; a.sa -= b; a.sb -= b; a.sc -= b; a.sd -= b; a.se -= b; a.sf -= b; }
inline __device__ void operator -= (u64x &a, const u64x b) { a.s0 -= b.s0; a.s1 -= b.s1; a.s2 -= b.s2; a.s3 -= b.s3; a.s4 -= b.s4; a.s5 -= b.s5; a.s6 -= b.s6; a.s7 -= b.s7; a.s8 -= b.s8; a.s9 -= b.s9; a.sa -= b.sa; a.sb -= b.sb; a.sc -= b.sc; a.sd -= b.sd; a.se -= b.se; a.sf -= b.sf; }
inline __device__ void operator *= (u64x &a, const u64 b) { a.s0 *= b; a.s1 *= b; a.s2 *= b; a.s3 *= b; a.s4 *= b; a.s5 *= b; a.s6 *= b; a.s7 *= b; a.s8 *= b; a.s9 *= b; a.sa *= b; a.sb *= b; a.sc *= b; a.sd *= b; a.se *= b; a.sf *= b; }
inline __device__ void operator *= (u64x &a, const u64x b) { a.s0 *= b.s0; a.s1 *= b.s1; a.s2 *= b.s2; a.s3 *= b.s3; a.s4 *= b.s4; a.s5 *= b.s5; a.s6 *= b.s6; a.s7 *= b.s7; a.s8 *= b.s8; a.s9 *= b.s9; a.sa *= b.sa; a.sb *= b.sb; a.sc *= b.sc; a.sd *= b.sd; a.se *= b.se; a.sf *= b.sf; }
inline __device__ void operator >>= (u64x &a, const u64 b) { a.s0 >>= b; a.s1 >>= b; a.s2 >>= b; a.s3 >>= b; a.s4 >>= b; a.s5 >>= b; a.s6 >>= b; a.s7 >>= b; a.s8 >>= b; a.s9 >>= b; a.sa >>= b; a.sb >>= b; a.sc >>= b; a.sd >>= b; a.se >>= b; a.sf >>= b; }
inline __device__ void operator >>= (u64x &a, const u64x b) { a.s0 >>= b.s0; a.s1 >>= b.s1; a.s2 >>= b.s2; a.s3 >>= b.s3; a.s4 >>= b.s4; a.s5 >>= b.s5; a.s6 >>= b.s6; a.s7 >>= b.s7; a.s8 >>= b.s8; a.s9 >>= b.s9; a.sa >>= b.sa; a.sb >>= b.sb; a.sc >>= b.sc; a.sd >>= b.sd; a.se >>= b.se; a.sf >>= b.sf; }
inline __device__ void operator <<= (u64x &a, const u64 b) { a.s0 <<= b; a.s1 <<= b; a.s2 <<= b; a.s3 <<= b; a.s4 <<= b; a.s5 <<= b; a.s6 <<= b; a.s7 <<= b; a.s8 <<= b; a.s9 <<= b; a.sa <<= b; a.sb <<= b; a.sc <<= b; a.sd <<= b; a.se <<= b; a.sf <<= b; }
inline __device__ void operator <<= (u64x &a, const u64x b) { a.s0 <<= b.s0; a.s1 <<= b.s1; a.s2 <<= b.s2; a.s3 <<= b.s3; a.s4 <<= b.s4; a.s5 <<= b.s5; a.s6 <<= b.s6; a.s7 <<= b.s7; a.s8 <<= b.s8; a.s9 <<= b.s9; a.sa <<= b.sa; a.sb <<= b.sb; a.sc <<= b.sc; a.sd <<= b.sd; a.se <<= b.se; a.sf <<= b.sf; }
inline __device__ u64x operator << (const u64x a, const u64 b) { return u64x ((a.s0 << b), (a.s1 << b) , (a.s2 << b), (a.s3 << b) , (a.s4 << b), (a.s5 << b) , (a.s6 << b), (a.s7 << b), (a.s8 << b), (a.s9 << b) , (a.sa << b), (a.sb << b) , (a.sc << b), (a.sd << b) , (a.se << b), (a.sf << b) ); }
inline __device__ u64x operator << (const u64x a, const u64x b) { return u64x ((a.s0 << b.s0), (a.s1 << b.s1), (a.s2 << b.s2), (a.s3 << b.s3), (a.s4 << b.s4), (a.s5 << b.s5), (a.s6 << b.s6), (a.s7 << b.s7), (a.s8 << b.s8), (a.s9 << b.s9), (a.sa << b.sa), (a.sb << b.sb), (a.sc << b.sc), (a.sd << b.sd), (a.se << b.se), (a.sf << b.sf)); }
inline __device__ u64x operator >> (const u64x a, const u64 b) { return u64x ((a.s0 >> b), (a.s1 >> b) , (a.s2 >> b), (a.s3 >> b) , (a.s4 >> b), (a.s5 >> b) , (a.s6 >> b), (a.s7 >> b), (a.s8 >> b), (a.s9 >> b) , (a.sa >> b), (a.sb >> b) , (a.sc >> b), (a.sd >> b) , (a.se >> b), (a.sf >> b) ); }
inline __device__ u64x operator >> (const u64x a, const u64x b) { return u64x ((a.s0 >> b.s0), (a.s1 >> b.s1), (a.s2 >> b.s2), (a.s3 >> b.s3), (a.s4 >> b.s4), (a.s5 >> b.s5), (a.s6 >> b.s6), (a.s7 >> b.s7), (a.s8 >> b.s8), (a.s9 >> b.s9), (a.sa >> b.sa), (a.sb >> b.sb), (a.sc >> b.sc), (a.sd >> b.sd), (a.se >> b.se), (a.sf >> b.sf)); }
inline __device__ u64x operator ^ (const u64x a, const u64 b) { return u64x ((a.s0 ^ b), (a.s1 ^ b) , (a.s2 ^ b), (a.s3 ^ b) , (a.s4 ^ b), (a.s5 ^ b) , (a.s6 ^ b), (a.s7 ^ b), (a.s8 ^ b), (a.s9 ^ b) , (a.sa ^ b), (a.sb ^ b) , (a.sc ^ b), (a.sd ^ b) , (a.se ^ b), (a.sf ^ b) ); }
inline __device__ u64x operator ^ (const u64x a, const u64x b) { return u64x ((a.s0 ^ b.s0), (a.s1 ^ b.s1), (a.s2 ^ b.s2), (a.s3 ^ b.s3), (a.s4 ^ b.s4), (a.s5 ^ b.s5), (a.s6 ^ b.s6), (a.s7 ^ b.s7), (a.s8 ^ b.s8), (a.s9 ^ b.s9), (a.sa ^ b.sa), (a.sb ^ b.sb), (a.sc ^ b.sc), (a.sd ^ b.sd), (a.se ^ b.se), (a.sf ^ b.sf)); }
inline __device__ u64x operator | (const u64x a, const u64 b) { return u64x ((a.s0 | b), (a.s1 | b) , (a.s2 | b), (a.s3 | b) , (a.s4 | b), (a.s5 | b) , (a.s6 | b), (a.s7 | b), (a.s8 | b), (a.s9 | b) , (a.sa | b), (a.sb | b) , (a.sc | b), (a.sd | b) , (a.se | b), (a.sf | b) ); }
inline __device__ u64x operator | (const u64x a, const u64x b) { return u64x ((a.s0 | b.s0), (a.s1 | b.s1), (a.s2 | b.s2), (a.s3 | b.s3), (a.s4 | b.s4), (a.s5 | b.s5), (a.s6 | b.s6), (a.s7 | b.s7), (a.s8 | b.s8), (a.s9 | b.s9), (a.sa | b.sa), (a.sb | b.sb), (a.sc | b.sc), (a.sd | b.sd), (a.se | b.se), (a.sf | b.sf)); }
inline __device__ u64x operator & (const u64x a, const u64 b) { return u64x ((a.s0 & b), (a.s1 & b) , (a.s2 & b), (a.s3 & b) , (a.s4 & b), (a.s5 & b) , (a.s6 & b), (a.s7 & b), (a.s8 & b), (a.s9 & b) , (a.sa & b), (a.sb & b) , (a.sc & b), (a.sd & b) , (a.se & b), (a.sf & b) ); }
inline __device__ u64x operator & (const u64x a, const u64x b) { return u64x ((a.s0 & b.s0), (a.s1 & b.s1), (a.s2 & b.s2), (a.s3 & b.s3), (a.s4 & b.s4), (a.s5 & b.s5), (a.s6 & b.s6), (a.s7 & b.s7), (a.s8 & b.s8), (a.s9 & b.s9), (a.sa & b.sa), (a.sb & b.sb), (a.sc & b.sc), (a.sd & b.sd), (a.se & b.se), (a.sf & b.sf)); }
inline __device__ u64x operator + (const u64x a, const u64 b) { return u64x ((a.s0 + b), (a.s1 + b) , (a.s2 + b), (a.s3 + b) , (a.s4 + b), (a.s5 + b) , (a.s6 + b), (a.s7 + b), (a.s8 + b), (a.s9 + b) , (a.sa + b), (a.sb + b) , (a.sc + b), (a.sd + b) , (a.se + b), (a.sf + b) ); }
inline __device__ u64x operator + (const u64x a, const u64x b) { return u64x ((a.s0 + b.s0), (a.s1 + b.s1), (a.s2 + b.s2), (a.s3 + b.s3), (a.s4 + b.s4), (a.s5 + b.s5), (a.s6 + b.s6), (a.s7 + b.s7), (a.s8 + b.s8), (a.s9 + b.s9), (a.sa + b.sa), (a.sb + b.sb), (a.sc + b.sc), (a.sd + b.sd), (a.se + b.se), (a.sf + b.sf)); }
inline __device__ u64x operator - (const u64x a, const u64 b) { return u64x ((a.s0 - b), (a.s1 - b) , (a.s2 - b), (a.s3 - b) , (a.s4 - b), (a.s5 - b) , (a.s6 - b), (a.s7 - b), (a.s8 - b), (a.s9 - b) , (a.sa - b), (a.sb - b) , (a.sc - b), (a.sd - b) , (a.se - b), (a.sf - b) ); }
inline __device__ u64x operator - (const u64x a, const u64x b) { return u64x ((a.s0 - b.s0), (a.s1 - b.s1), (a.s2 - b.s2), (a.s3 - b.s3), (a.s4 - b.s4), (a.s5 - b.s5), (a.s6 - b.s6), (a.s7 - b.s7), (a.s8 - b.s8), (a.s9 - b.s9), (a.sa - b.sa), (a.sb - b.sb), (a.sc - b.sc), (a.sd - b.sd), (a.se - b.se), (a.sf - b.sf)); }
inline __device__ u64x operator * (const u64x a, const u64 b) { return u64x ((a.s0 * b), (a.s1 * b) , (a.s2 * b), (a.s3 * b) , (a.s4 * b), (a.s5 * b) , (a.s6 * b), (a.s7 * b), (a.s8 * b), (a.s9 * b) , (a.sa * b), (a.sb * b) , (a.sc * b), (a.sd * b) , (a.se * b), (a.sf * b) ); }
inline __device__ u64x operator * (const u64x a, const u64x b) { return u64x ((a.s0 * b.s0), (a.s1 * b.s1), (a.s2 * b.s2), (a.s3 * b.s3), (a.s4 * b.s4), (a.s5 * b.s5), (a.s6 * b.s6), (a.s7 * b.s7), (a.s8 * b.s8), (a.s9 * b.s9), (a.sa * b.sa), (a.sb * b.sb), (a.sc * b.sc), (a.sd * b.sd), (a.se * b.se), (a.sf * b.sf)); }
inline __device__ u64x operator % (const u64x a, const u64 b) { return u64x ((a.s0 % b), (a.s1 % b) , (a.s2 % b), (a.s3 % b) , (a.s4 % b), (a.s5 % b) , (a.s6 % b), (a.s7 % b), (a.s8 % b), (a.s9 % b) , (a.sa % b), (a.sb % b) , (a.sc % b), (a.sd % b) , (a.se % b), (a.sf % b) ); }
inline __device__ u64x operator % (const u64x a, const u64x b) { return u64x ((a.s0 % b.s0), (a.s1 % b.s1), (a.s2 % b.s2), (a.s3 % b.s3), (a.s4 % b.s4), (a.s5 % b.s5), (a.s6 % b.s6), (a.s7 % b.s7), (a.s8 % b.s8), (a.s9 % b.s9), (a.sa % b.sa), (a.sb % b.sb), (a.sc % b.sc), (a.sd % b.sd), (a.se % b.se), (a.sf % b.sf)); }
inline __device__ u64x operator ~ (const u64x a) { return u64x (~a.s0, ~a.s1, ~a.s2, ~a.s3, ~a.s4, ~a.s5, ~a.s6, ~a.s7, ~a.s8, ~a.s9, ~a.sa, ~a.sb, ~a.sc, ~a.sd, ~a.se, ~a.sf); }
#endif
typedef struct u8x u8x;
typedef struct u16x u16x;
typedef struct u32x u32x;
typedef struct u64x u64x;
#define make_u8x u8x
#define make_u16x u16x
#define make_u32x u32x
#define make_u64x u64x
#else
typedef VTYPE(uchar, VECT_SIZE) u8x;
typedef VTYPE(uchar, VECT_SIZE) u8x;
typedef VTYPE(ushort, VECT_SIZE) u16x;
typedef VTYPE(uint, VECT_SIZE) u32x;
typedef VTYPE(ulong, VECT_SIZE) u64x;
typedef VTYPE(ullong, VECT_SIZE) u64x;
#define make_u8x (u8x)
#define make_u16x (u16x)

@ -16,10 +16,6 @@
#define IS_OPENCL
#endif
#ifdef IS_HIP
#include <hip/hip_runtime.h>
#endif
#if defined IS_NATIVE
#define CONSTANT_VK
#define CONSTANT_AS

@ -73,18 +73,16 @@ enum{
MZ_VERSION_ERROR = -6,
MZ_PARAM_ERROR = -10000
};
typedef unsigned long mz_ulong;
typedef ullong mz_ulong;
#ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES
typedef unsigned char Byte;
typedef unsigned int uInt;
typedef mz_ulong uLong;
typedef Byte Bytef;
typedef uInt uIntf;
typedef char charf;
typedef int intf;
typedef void *voidpf;
typedef uLong uLongf;
typedef void *voidp;
typedef void *const voidpc;
#define Z_NULL 0
@ -204,10 +202,6 @@ DECLSPEC void *memset(u8 *s, int c, u32 len){
#define MZ_MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MZ_DEFAULT_WINDOW_BITS 15
#define TINFL_LZ_DICT_SIZE 32768
#define TINFL_MEMCPY(d, s, l) memcpy(d, s, l)
#define TINFL_MEMCPY_G(d, s, l, p) memcpy_g(d, s, l, p)
#define TINFL_MEMSET(p, c, l) memset(p, c, (u32)l)
#define MZ_CLEAR_OBJ(obj) memset(&(obj), 0, sizeof(obj))
// hashcat-patched/hashcat-specific:
#ifdef CRC32_IN_INFLATE
@ -583,7 +577,7 @@ DECLSPEC tinfl_status tinfl_decompress(tinfl_decompressor *r, MAYBE_GLOBAL const
TINFL_CR_RETURN(38, (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) ? TINFL_STATUS_NEEDS_MORE_INPUT : TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS);
}
n = MZ_MIN(MZ_MIN((size_t)(pOut_buf_end - pOut_buf_cur), (size_t)(pIn_buf_end - pIn_buf_cur)), counter);
TINFL_MEMCPY_G(pOut_buf_cur, pIn_buf_cur, n, pStream);
memcpy_g(pOut_buf_cur, pIn_buf_cur, n, pStream);
pIn_buf_cur += n;
pOut_buf_cur += n;
counter -= (mz_uint)n;
@ -601,7 +595,7 @@ DECLSPEC tinfl_status tinfl_decompress(tinfl_decompressor *r, MAYBE_GLOBAL const
mz_uint i;
r->m_table_sizes[0] = 288;
r->m_table_sizes[1] = 32;
TINFL_MEMSET(r->m_tables[1].m_code_size, 5, 32);
memset(r->m_tables[1].m_code_size, 5, 32);
for (i = 0; i <= 143; ++i)
*p++ = 8;
for (; i <= 255; ++i)
@ -618,7 +612,8 @@ DECLSPEC tinfl_status tinfl_decompress(tinfl_decompressor *r, MAYBE_GLOBAL const
TINFL_GET_BITS(11, r->m_table_sizes[counter], "\05\05\04"[counter]);
r->m_table_sizes[counter] += s_min_table_sizes[counter];
}
MZ_CLEAR_OBJ(r->m_tables[2].m_code_size);
memset(r->m_tables[2].m_code_size, 0, TINFL_MAX_HUFF_SYMBOLS_0);
for (counter = 0; counter < r->m_table_sizes[2]; counter++)
{
mz_uint s;
@ -633,9 +628,11 @@ DECLSPEC tinfl_status tinfl_decompress(tinfl_decompressor *r, MAYBE_GLOBAL const
tinfl_huff_table *pTable;
mz_uint i, j, used_syms, total, sym_index, next_code[17], total_syms[16];
pTable = &r->m_tables[r->m_type];
MZ_CLEAR_OBJ(total_syms);
MZ_CLEAR_OBJ(pTable->m_look_up);
MZ_CLEAR_OBJ(pTable->m_tree);
memset((u8 *) total_syms, 0, 64);
memset((u8 *) pTable->m_look_up, 0, TINFL_FAST_LOOKUP_SIZE * 2);
memset((u8 *) pTable->m_tree, 0, TINFL_MAX_HUFF_SYMBOLS_0 * 2 * 2);
for (i = 0; i < r->m_table_sizes[r->m_type]; ++i)
total_syms[pTable->m_code_size[i]]++;
used_syms = 0, total = 0;
@ -707,15 +704,18 @@ DECLSPEC tinfl_status tinfl_decompress(tinfl_decompressor *r, MAYBE_GLOBAL const
num_extra = "\02\03\07"[dist - 16];
TINFL_GET_BITS(18, s, num_extra);
s += "\03\03\013"[dist - 16];
TINFL_MEMSET(r->m_len_codes + counter, (dist == 16) ? r->m_len_codes[counter - 1] : 0, s);
memset(r->m_len_codes + counter, (dist == 16) ? r->m_len_codes[counter - 1] : 0, s);
counter += s;
}
if ((r->m_table_sizes[0] + r->m_table_sizes[1]) != counter)
{
TINFL_CR_RETURN_FOREVER(21, TINFL_STATUS_FAILED);
}
TINFL_MEMCPY(r->m_tables[0].m_code_size, r->m_len_codes, r->m_table_sizes[0]);
TINFL_MEMCPY(r->m_tables[1].m_code_size, r->m_len_codes + r->m_table_sizes[0], r->m_table_sizes[1]);
memcpy(r->m_tables[0].m_code_size, r->m_len_codes, r->m_table_sizes[0]);
memcpy(r->m_tables[1].m_code_size, r->m_len_codes + r->m_table_sizes[0], r->m_table_sizes[1]);
}
}
for (;;)

@ -32,7 +32,7 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 tmp3;
u32 tmp4;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
@ -45,12 +45,18 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
tmp4 = hc_bytealign (in3, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];
@ -139,7 +145,7 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 tmp3;
u32 tmp4;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
@ -153,12 +159,18 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
tmp4 = hc_bytealign (in3, in4, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];
@ -246,7 +258,7 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 tmp1;
u32 tmp2;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
@ -255,12 +267,18 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
tmp2 = hc_bytealign (in1, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];

@ -1664,18 +1664,18 @@ DECLSPEC void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32
DECLSPEC void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63)
{
const u32 s001 = (0x001 & SALT) ? 0xffffffff : 0;
const u32 s002 = (0x002 & SALT) ? 0xffffffff : 0;
const u32 s004 = (0x004 & SALT) ? 0xffffffff : 0;
const u32 s008 = (0x008 & SALT) ? 0xffffffff : 0;
const u32 s010 = (0x010 & SALT) ? 0xffffffff : 0;
const u32 s020 = (0x020 & SALT) ? 0xffffffff : 0;
const u32 s040 = (0x040 & SALT) ? 0xffffffff : 0;
const u32 s080 = (0x080 & SALT) ? 0xffffffff : 0;
const u32 s100 = (0x100 & SALT) ? 0xffffffff : 0;
const u32 s200 = (0x200 & SALT) ? 0xffffffff : 0;
const u32 s400 = (0x400 & SALT) ? 0xffffffff : 0;
const u32 s800 = (0x800 & SALT) ? 0xffffffff : 0;
const u32 s001 = (0x001 & SALT) ? 1 : 0;
const u32 s002 = (0x002 & SALT) ? 1 : 0;
const u32 s004 = (0x004 & SALT) ? 1 : 0;
const u32 s008 = (0x008 & SALT) ? 1 : 0;
const u32 s010 = (0x010 & SALT) ? 1 : 0;
const u32 s020 = (0x020 & SALT) ? 1 : 0;
const u32 s040 = (0x040 & SALT) ? 1 : 0;
const u32 s080 = (0x080 & SALT) ? 1 : 0;
const u32 s100 = (0x100 & SALT) ? 1 : 0;
const u32 s200 = (0x200 & SALT) ? 1 : 0;
const u32 s400 = (0x400 & SALT) ? 1 : 0;
const u32 s800 = (0x800 & SALT) ? 1 : 0;
KXX_DECL u32 k00, k01, k02, k03, k04, k05;
KXX_DECL u32 k06, k07, k08, k09, k10, k11;

@ -31,7 +31,7 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 tmp3;
u32 tmp4;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
@ -44,12 +44,18 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
tmp4 = hc_bytealign (in3, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];
@ -138,7 +144,7 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 tmp3;
u32 tmp4;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
@ -152,12 +158,18 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
tmp4 = hc_bytealign (in3, in4, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];
@ -245,7 +257,7 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 tmp1;
u32 tmp2;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
@ -254,12 +266,18 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
tmp2 = hc_bytealign (in1, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];

@ -775,11 +775,7 @@ KERNEL_FQ void m02500_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pbkdf2_tmp_t, wpa_eapol_t)
s_te4[i] = te4[i];
}
#if defined IS_CUDA || defined IS_HIP
__syncthreads();
#else
SYNC_THREADS ();
#endif
#else

@ -2119,7 +2119,7 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
u32 tmp4;
u32 tmp5;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
@ -2134,12 +2134,18 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
tmp5 = hc_bytealign (in4, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];

@ -28,7 +28,7 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 tmp3;
u32 tmp4;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
@ -41,12 +41,18 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
tmp4 = hc_bytealign (in3, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];
@ -135,7 +141,7 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 tmp3;
u32 tmp4;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
@ -149,12 +155,18 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
tmp4 = hc_bytealign (in3, in4, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];
@ -242,7 +254,7 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 tmp1;
u32 tmp2;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
u32 in0 = append[0];
u32 in1 = append[1];
@ -251,12 +263,18 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
tmp2 = hc_bytealign (in1, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
#endif
u32 in0 = append[0];
u32 in1 = append[1];

@ -45,7 +45,7 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3
u32 in2 = append[2];
u32 in3 = append[3];
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
const u32 tmp0 = hc_bytealign_be ( 0, in0, offset);
const u32 tmp1 = hc_bytealign_be (in0, in1, offset);
const u32 tmp2 = hc_bytealign_be (in1, in2, offset);
@ -53,8 +53,15 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3
const u32 tmp4 = hc_bytealign_be (in3, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
const u32 tmp0 = hc_byte_perm_S (in0, 0, selector);
const u32 tmp1 = hc_byte_perm_S (in1, in0, selector);
@ -165,7 +172,7 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u
u32 in2 = append[2];
u32 in3 = append[3];
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
const u32 tmp0 = hc_bytealign_be ( 0, in0, offset);
const u32 tmp1 = hc_bytealign_be (in0, in1, offset);
const u32 tmp2 = hc_bytealign_be (in1, in2, offset);
@ -173,8 +180,15 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u
const u32 tmp4 = hc_bytealign_be (in3, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
const u32 tmp0 = hc_byte_perm_S (in0, 0, selector);
const u32 tmp1 = hc_byte_perm_S (in1, in0, selector);
@ -322,7 +336,7 @@ DECLSPEC u32 memcat16s (u32 *block, const u32 offset, const u32 *append, const u
u32 in3 = append[3];
u32 in4 = append[4];
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
const u32 tmp0 = hc_bytealign_be ( 0, in0, offset);
const u32 tmp1 = hc_bytealign_be (in0, in1, offset);
const u32 tmp2 = hc_bytealign_be (in1, in2, offset);
@ -331,8 +345,15 @@ DECLSPEC u32 memcat16s (u32 *block, const u32 offset, const u32 *append, const u
const u32 tmp5 = hc_bytealign_be (in4, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
const u32 tmp0 = hc_byte_perm_S (in0, 0, selector);
const u32 tmp1 = hc_byte_perm_S (in1, in0, selector);
@ -456,7 +477,7 @@ DECLSPEC u32 memcat16sc (u32 *block, const u32 offset, const u32 *append, const
u32 in3 = append[3];
u32 in4 = append[4];
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
const u32 tmp0 = hc_bytealign_be ( 0, in0, offset);
const u32 tmp1 = hc_bytealign_be (in0, in1, offset);
const u32 tmp2 = hc_bytealign_be (in1, in2, offset);
@ -465,8 +486,15 @@ DECLSPEC u32 memcat16sc (u32 *block, const u32 offset, const u32 *append, const
const u32 tmp5 = hc_bytealign_be (in4, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
const u32 tmp0 = hc_byte_perm_S (in0, 0, selector);
const u32 tmp1 = hc_byte_perm_S (in1, in0, selector);
@ -756,7 +784,7 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3
u32 in2 = append[2];
u32 in3 = append[3];
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset);
const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset);
const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset);
@ -764,8 +792,15 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3
const u32 tmp4 = hc_bytealign_be_S (in3, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
const u32 tmp0 = hc_byte_perm_S (in0, 0, selector);
const u32 tmp1 = hc_byte_perm_S (in1, in0, selector);
@ -915,7 +950,7 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons
u32 in3 = append[3];
u32 in4 = 0x80000000;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset);
const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset);
const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset);
@ -923,8 +958,15 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons
const u32 tmp4 = hc_bytealign_be_S (in3, in4, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
const u32 tmp0 = hc_byte_perm_S (in0, 0, selector);
const u32 tmp1 = hc_byte_perm_S (in1, in0, selector);
@ -1074,7 +1116,7 @@ DECLSPEC u32 memcat24 (u32 *block, const u32 offset, const u32 *append, const u3
u32 in3 = append[3];
u32 in4 = append[4];
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset);
const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset);
const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset);
@ -1083,8 +1125,15 @@ DECLSPEC u32 memcat24 (u32 *block, const u32 offset, const u32 *append, const u3
const u32 tmp5 = hc_bytealign_be_S (in4, 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
const u32 tmp0 = hc_byte_perm_S (in0, 0, selector);
const u32 tmp1 = hc_byte_perm_S (in1, in0, selector);

@ -303,7 +303,7 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t))
digest[6] = sha256_hmac_ctx2.opad.h[6];
digest[7] = sha256_hmac_ctx2.opad.h[7];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]);
const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]);
#else
@ -331,7 +331,7 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t))
uint4 X[4];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
X[0] = make_uint4 (T[0].x, T[1].y, T[2].z, T[3].w);
X[1] = make_uint4 (T[1].x, T[2].y, T[3].z, T[0].w);
X[2] = make_uint4 (T[2].x, T[3].y, T[0].z, T[1].w);
@ -441,7 +441,7 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
uint4 T[4];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
T[0] = make_uint4 (X[0].x, X[3].y, X[2].z, X[1].w);
T[1] = make_uint4 (X[1].x, X[0].y, X[3].z, X[2].w);
T[2] = make_uint4 (X[2].x, X[1].y, X[0].z, X[3].w);

@ -310,6 +310,51 @@ CONSTANT_VK u32a c_pbox[18] =
0x9216d5d9, 0x8979fb1b
};
// Yes, works only with CUDA atm
#ifdef DYNAMIC_LOCAL
#define BCRYPT_AVOID_BANK_CONFLICTS
#endif
#ifdef BCRYPT_AVOID_BANK_CONFLICTS
// access pattern: minimize bank ID based on thread ID but thread ID is not saved from computation
#define KEY32(lid,key) (((key) * FIXED_LOCAL_SIZE) + (lid))
DECLSPEC u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key)
{
const u64 lid = get_local_id (0);
return S[KEY32 (lid, key)];
}
DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val)
{
const u64 lid = get_local_id (0);
S[KEY32 (lid, key)] = val;
}
#undef KEY32
#else
// access pattern: linear access with S offset already set to right offset based on thread ID saving it from compuation
// makes sense if there are not thread ID's (for instance on CPU)
DECLSPEC inline u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key)
{
return S[key];
}
DECLSPEC inline void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val)
{
S[key] = val;
}
#endif
#define BF_ROUND(L,R,N) \
{ \
u32 tmp; \
@ -319,10 +364,10 @@ CONSTANT_VK u32a c_pbox[18] =
const u32 r2 = unpack_v8b_from_v32_S ((L)); \
const u32 r3 = unpack_v8a_from_v32_S ((L)); \
\
tmp = S0[r0]; \
tmp += S1[r1]; \
tmp ^= S2[r2]; \
tmp += S3[r3]; \
tmp = GET_KEY32 (S0, r0); \
tmp += GET_KEY32 (S1, r1); \
tmp ^= GET_KEY32 (S2, r2); \
tmp += GET_KEY32 (S3, r3); \
\
(R) ^= tmp ^ P[(N)]; \
}
@ -357,6 +402,10 @@ CONSTANT_VK u32a c_pbox[18] =
L ^= P[17]; \
}
#ifdef DYNAMIC_LOCAL
extern __shared__ u32 S[];
#endif
KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_init (KERN_ATTR_TMPS (pwsafe2_tmp_t))
{
/**
@ -471,22 +520,33 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_init (KERN_ATTR_TMPS
P[i] = c_pbox[i];
}
#ifdef DYNAMIC_LOCAL
// from host
#else
LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
#endif
#ifdef BCRYPT_AVOID_BANK_CONFLICTS
LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0);
LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1);
LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2);
LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3);
#else
LOCAL_AS u32 *S0 = S0_all[lid];
LOCAL_AS u32 *S1 = S1_all[lid];
LOCAL_AS u32 *S2 = S2_all[lid];
LOCAL_AS u32 *S3 = S3_all[lid];
#endif
for (u32 i = 0; i < 256; i++)
{
S0[i] = c_sbox0[i];
S1[i] = c_sbox1[i];
S2[i] = c_sbox2[i];
S3[i] = c_sbox3[i];
SET_KEY32 (S0, i, c_sbox0[i]);
SET_KEY32 (S1, i, c_sbox1[i]);
SET_KEY32 (S2, i, c_sbox2[i]);
SET_KEY32 (S3, i, c_sbox3[i]);
}
for (u32 i = 0; i < 18; i++)
@ -509,59 +569,59 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_init (KERN_ATTR_TMPS
{
BF_ENCRYPT (L0, R0);
S0[i + 0] = L0;
S0[i + 1] = R0;
SET_KEY32 (S0, i + 0, L0);
SET_KEY32 (S0, i + 1, R0);
BF_ENCRYPT (L0, R0);
S0[i + 2] = L0;
S0[i + 3] = R0;
SET_KEY32 (S0, i + 2, L0);
SET_KEY32 (S0, i + 3, R0);
}
for (u32 i = 0; i < 256; i += 4)
{
BF_ENCRYPT (L0, R0);
S1[i + 0] = L0;
S1[i + 1] = R0;
SET_KEY32 (S1, i + 0, L0);
SET_KEY32 (S1, i + 1, R0);
BF_ENCRYPT (L0, R0);
S1[i + 2] = L0;
S1[i + 3] = R0;
SET_KEY32 (S1, i + 2, L0);
SET_KEY32 (S1, i + 3, R0);
}
for (u32 i = 0; i < 256; i += 4)
{
BF_ENCRYPT (L0, R0);
S2[i + 0] = L0;
S2[i + 1] = R0;
SET_KEY32 (S2, i + 0, L0);
SET_KEY32 (S2, i + 1, R0);
BF_ENCRYPT (L0, R0);
S2[i + 2] = L0;
S2[i + 3] = R0;
SET_KEY32 (S2, i + 2, L0);
SET_KEY32 (S2, i + 3, R0);
}
for (u32 i = 0; i < 256; i += 4)
{
BF_ENCRYPT (L0, R0);
S3[i + 0] = L0;
S3[i + 1] = R0;
SET_KEY32 (S3, i + 0, L0);
SET_KEY32 (S3, i + 1, R0);
BF_ENCRYPT (L0, R0);
S3[i + 2] = L0;
S3[i + 3] = R0;
SET_KEY32 (S3, i + 2, L0);
SET_KEY32 (S3, i + 3, R0);
}
// store
tmps[gid].digest[0] = salt_buf[0];
tmps[gid].digest[1] = salt_buf[1];
// store
for (u32 i = 0; i < 18; i++)
{
tmps[gid].P[i] = P[i];
@ -569,10 +629,10 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_init (KERN_ATTR_TMPS
for (u32 i = 0; i < 256; i++)
{
tmps[gid].S0[i] = S0[i];
tmps[gid].S1[i] = S1[i];
tmps[gid].S2[i] = S2[i];
tmps[gid].S3[i] = S3[i];
tmps[gid].S0[i] = GET_KEY32 (S0, i);
tmps[gid].S1[i] = GET_KEY32 (S1, i);
tmps[gid].S2[i] = GET_KEY32 (S2, i);
tmps[gid].S3[i] = GET_KEY32 (S3, i);
}
}
@ -602,22 +662,33 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_loop (KERN_ATTR_TMPS
P[i] = tmps[gid].P[i];
}
#ifdef DYNAMIC_LOCAL
// from host
#else
LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
#endif
#ifdef BCRYPT_AVOID_BANK_CONFLICTS
LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE * 256 * 0);
LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE * 256 * 1);
LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE * 256 * 2);
LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE * 256 * 3);
#else
LOCAL_AS u32 *S0 = S0_all[lid];
LOCAL_AS u32 *S1 = S1_all[lid];
LOCAL_AS u32 *S2 = S2_all[lid];
LOCAL_AS u32 *S3 = S3_all[lid];
#endif
for (u32 i = 0; i < 256; i++)
{
S0[i] = tmps[gid].S0[i];
S1[i] = tmps[gid].S1[i];
S2[i] = tmps[gid].S2[i];
S3[i] = tmps[gid].S3[i];
SET_KEY32 (S0, i, tmps[gid].S0[i]);
SET_KEY32 (S1, i, tmps[gid].S1[i]);
SET_KEY32 (S2, i, tmps[gid].S2[i]);
SET_KEY32 (S3, i, tmps[gid].S3[i]);
}
// loop
@ -630,8 +701,6 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m09000_loop (KERN_ATTR_TMPS
BF_ENCRYPT (L0, R0);
}
// store
tmps[gid].digest[0] = L0;
tmps[gid].digest[1] = R0;
}

@ -232,7 +232,7 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl,
u32 i;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
for (i = 0; i < pd; i++) sc[idx++] = pw[i];
sc[idx++] = pw[i]
| hc_bytealign_be (bl[0], 0, pm4);
@ -242,8 +242,15 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl,
sc[idx++] = hc_bytealign_be ( 0, sc[i - 1], pm4);
#endif
#ifdef IS_NV
int selector = (0x76543210 >> (pm4 * 4)) & 0xffff;
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((pm4 & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((pm4 & 3) * 8));
#endif
for (i = 0; i < pd; i++) sc[idx++] = pw[i];
sc[idx++] = pw[i]
@ -263,16 +270,22 @@ DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, con
const u32 om = m % 4;
const u32 od = m / 4;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
pt[0] = hc_bytealign_be (sc[od + 1], sc[od + 0], om);
pt[1] = hc_bytealign_be (sc[od + 2], sc[od + 1], om);
pt[2] = hc_bytealign_be (sc[od + 3], sc[od + 2], om);
pt[3] = hc_bytealign_be (sc[od + 4], sc[od + 3], om);
#endif
#ifdef IS_NV
int selector = (0x76543210 >> (om * 4)) & 0xffff;
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((om & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((om & 3) * 8));
#endif
pt[0] = hc_byte_perm (sc[od + 0], sc[od + 1], selector);
pt[1] = hc_byte_perm (sc[od + 1], sc[od + 2], selector);
pt[2] = hc_byte_perm (sc[od + 2], sc[od + 3], selector);

@ -42,13 +42,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
u32 tmp0;
u32 tmp1;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
tmp0 = hc_bytealign_be (0, append, func_len);
tmp1 = hc_bytealign_be (append, 0, func_len);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8));
#endif
tmp0 = hc_byte_perm (append, 0, selector);
tmp1 = hc_byte_perm (0, append, selector);

@ -37,13 +37,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
u32 tmp0;
u32 tmp1;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
tmp0 = hc_bytealign_be (0, append, func_len);
tmp1 = hc_bytealign_be (append, 0, func_len);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8));
#endif
tmp0 = hc_byte_perm (append, 0, selector);
tmp1 = hc_byte_perm (0, append, selector);
@ -797,7 +804,7 @@ KERNEL_FQ void m12500_loop (KERN_ATTR_TMPS (rar3_tmp_t))
* base
*/
const u32 pw_len = pws[gid].pw_len & 255;
const u32 pw_len = pws[gid].pw_len;
const u32 salt_len = 8;
@ -963,7 +970,7 @@ KERNEL_FQ void m12500_comp (KERN_ATTR_TMPS (rar3_tmp_t))
* base
*/
const u32 pw_len = pws[gid].pw_len & 255;
const u32 pw_len = pws[gid].pw_len;
const u32 salt_len = 8;

@ -37,7 +37,7 @@ typedef struct zip2
u32 verify_bytes;
u32 compress_length;
u32 data_len;
u32 data_buf[0x4000000];
u32 data_buf[0x200000];
u32 auth_len;
u32 auth_buf[4];

@ -366,7 +366,9 @@ KERNEL_FQ void m13711_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 3, out[3]);
unpackv (tmps, pim_key, gid, i + 4, out[4]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -417,7 +417,9 @@ KERNEL_FQ void m13712_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 3, out[3]);
unpackv (tmps, pim_key, gid, i + 4, out[4]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -482,7 +482,9 @@ KERNEL_FQ void m13713_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 3, out[3]);
unpackv (tmps, pim_key, gid, i + 4, out[4]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -458,7 +458,9 @@ KERNEL_FQ void m13721_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
unpack64v (tmps, pim_key, gid, i + 6, out[6]);
unpack64v (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -509,7 +509,9 @@ KERNEL_FQ void m13722_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
unpack64v (tmps, pim_key, gid, i + 6, out[6]);
unpack64v (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -574,7 +574,9 @@ KERNEL_FQ void m13723_loop (KERN_ATTR_TMPS_ESALT (vc64_tmp_t, vc_t))
unpack64v (tmps, pim_key, gid, i + 6, out[6]);
unpack64v (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -622,7 +622,9 @@ KERNEL_FQ void m13731_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 14, out[14]);
unpackv (tmps, pim_key, gid, i + 15, out[15]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -673,7 +673,9 @@ KERNEL_FQ void m13732_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 14, out[14]);
unpackv (tmps, pim_key, gid, i + 15, out[15]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -738,7 +738,9 @@ KERNEL_FQ void m13733_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 14, out[14]);
unpackv (tmps, pim_key, gid, i + 15, out[15]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -435,7 +435,9 @@ KERNEL_FQ void m13751_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 6, out[6]);
unpackv (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -457,7 +457,9 @@ KERNEL_FQ void m13752_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 6, out[6]);
unpackv (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -522,7 +522,9 @@ KERNEL_FQ void m13753_loop (KERN_ATTR_TMPS_ESALT (vc_tmp_t, vc_t))
unpackv (tmps, pim_key, gid, i + 6, out[6]);
unpackv (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -537,7 +537,9 @@ KERNEL_FQ void m13771_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
unpack64v (tmps, pim_key, gid, i + 6, out[6]);
unpack64v (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -588,7 +588,9 @@ KERNEL_FQ void m13772_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
unpack64v (tmps, pim_key, gid, i + 6, out[6]);
unpack64v (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -653,7 +653,9 @@ KERNEL_FQ void m13773_loop (KERN_ATTR_TMPS_ESALT (vc64_sbog_tmp_t, vc_t))
unpack64v (tmps, pim_key, gid, i + 6, out[6]);
unpack64v (tmps, pim_key, gid, i + 7, out[7]);
tmps[gid].pim_check = pim;
const u32x pimx = make_u32x (pim);
unpack (tmps, pim_check, gid, pimx);
}
}

@ -51,7 +51,7 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp15;
u32x tmp16;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
tmp00 = hc_bytealign_be ( 0, carry[ 0], offset);
tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset);
@ -71,8 +71,15 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
tmp16 = hc_bytealign_be (carry[15], 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
tmp00 = hc_byte_perm (carry[ 0], 0, selector);
tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector);

@ -49,7 +49,7 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp15;
u32x tmp16;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
tmp00 = hc_bytealign_be ( 0, carry[ 0], offset);
tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset);
@ -69,8 +69,15 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
tmp16 = hc_bytealign_be (carry[15], 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
tmp00 = hc_byte_perm (carry[ 0], 0, selector);
tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector);

@ -48,7 +48,7 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp15;
u32x tmp16;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
tmp00 = hc_bytealign_be ( 0, carry[ 0], offset);
tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset);
@ -68,8 +68,15 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
tmp16 = hc_bytealign_be (carry[15], 0, offset);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
#endif
tmp00 = hc_byte_perm (carry[ 0], 0, selector);
tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector);

@ -439,7 +439,7 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
digest[6] = sha256_hmac_ctx2.opad.h[6];
digest[7] = sha256_hmac_ctx2.opad.h[7];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]);
const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]);
#else
@ -467,7 +467,7 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
uint4 X[4];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
X[0] = make_uint4 (T[0].x, T[1].y, T[2].z, T[3].w);
X[1] = make_uint4 (T[1].x, T[2].y, T[3].z, T[0].w);
X[2] = make_uint4 (T[2].x, T[3].y, T[0].z, T[1].w);
@ -577,7 +577,7 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
uint4 T[4];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
T[0] = make_uint4 (X[0].x, X[3].y, X[2].z, X[1].w);
T[1] = make_uint4 (X[1].x, X[0].y, X[3].z, X[2].w);
T[2] = make_uint4 (X[2].x, X[1].y, X[0].z, X[3].w);

@ -319,6 +319,51 @@ CONSTANT_VK u32a c_pbox[18] =
0x9216d5d9, 0x8979fb1b
};
// Yes, works only with CUDA atm
#ifdef DYNAMIC_LOCAL
#define BCRYPT_AVOID_BANK_CONFLICTS
#endif
#ifdef BCRYPT_AVOID_BANK_CONFLICTS
// access pattern: minimize bank ID based on thread ID but thread ID is not saved from computation
#define KEY32(lid,key) (((key) * FIXED_LOCAL_SIZE_COMP) + (lid))
DECLSPEC u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key)
{
const u64 lid = get_local_id (0);
return S[KEY32 (lid, key)];
}
DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val)
{
const u64 lid = get_local_id (0);
S[KEY32 (lid, key)] = val;
}
#undef KEY32
#else
// access pattern: linear access with S offset already set to right offset based on thread ID saving it from compuation
// makes sense if there are not thread ID's (for instance on CPU)
DECLSPEC inline u32 GET_KEY32 (LOCAL_AS u32 *S, const u64 key)
{
return S[key];
}
DECLSPEC inline void SET_KEY32 (LOCAL_AS u32 *S, const u64 key, const u32 val)
{
S[key] = val;
}
#endif
#define BF_ROUND(L,R,N) \
{ \
u32 tmp; \
@ -328,10 +373,10 @@ CONSTANT_VK u32a c_pbox[18] =
const u32 r2 = unpack_v8b_from_v32_S ((L)); \
const u32 r3 = unpack_v8a_from_v32_S ((L)); \
\
tmp = S0[r0]; \
tmp += S1[r1]; \
tmp ^= S2[r2]; \
tmp += S3[r3]; \
tmp = GET_KEY32 (S0, r0); \
tmp += GET_KEY32 (S1, r1); \
tmp ^= GET_KEY32 (S2, r2); \
tmp += GET_KEY32 (S3, r3); \
\
(R) ^= tmp ^ P[(N)]; \
}
@ -366,6 +411,10 @@ CONSTANT_VK u32a c_pbox[18] =
L ^= P[17]; \
}
#ifdef DYNAMIC_LOCAL
extern __shared__ u32 S[];
#endif
DECLSPEC void hmac_sha1_run_V (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *ipad, u32x *opad, u32x *digest)
{
digest[0] = ipad[0];
@ -586,7 +635,7 @@ KERNEL_FQ void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
}
}
KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
{
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
@ -616,22 +665,33 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_
P[i] = c_pbox[i] ^ ukey[i % 4];
}
LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE][256];
LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE][256];
#ifdef DYNAMIC_LOCAL
// from host
#else
LOCAL_VK u32 S0_all[FIXED_LOCAL_SIZE_COMP][256];
LOCAL_VK u32 S1_all[FIXED_LOCAL_SIZE_COMP][256];
LOCAL_VK u32 S2_all[FIXED_LOCAL_SIZE_COMP][256];
LOCAL_VK u32 S3_all[FIXED_LOCAL_SIZE_COMP][256];
#endif
#ifdef BCRYPT_AVOID_BANK_CONFLICTS
LOCAL_AS u32 *S0 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 0);
LOCAL_AS u32 *S1 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 1);
LOCAL_AS u32 *S2 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 2);
LOCAL_AS u32 *S3 = S + (FIXED_LOCAL_SIZE_COMP * 256 * 3);
#else
LOCAL_AS u32 *S0 = S0_all[lid];
LOCAL_AS u32 *S1 = S1_all[lid];
LOCAL_AS u32 *S2 = S2_all[lid];
LOCAL_AS u32 *S3 = S3_all[lid];
#endif
for (u32 i = 0; i < 256; i++)
{
S0[i] = c_sbox0[i];
S1[i] = c_sbox1[i];
S2[i] = c_sbox2[i];
S3[i] = c_sbox3[i];
SET_KEY32 (S0, i, c_sbox0[i]);
SET_KEY32 (S1, i, c_sbox1[i]);
SET_KEY32 (S2, i, c_sbox2[i]);
SET_KEY32 (S3, i, c_sbox3[i]);
}
u32 L0 = 0;
@ -649,52 +709,52 @@ KERNEL_FQ void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE) m18600_comp (KERN_ATTR_TMPS_
{
BF_ENCRYPT (L0, R0);
S0[i + 0] = L0;
S0[i + 1] = R0;
SET_KEY32 (S0, i + 0, L0);
SET_KEY32 (S0, i + 1, R0);
BF_ENCRYPT (L0, R0);
S0[i + 2] = L0;
S0[i + 3] = R0;
SET_KEY32 (S0, i + 2, L0);
SET_KEY32 (S0, i + 3, R0);
}
for (u32 i = 0; i < 256; i += 4)
{
BF_ENCRYPT (L0, R0);
S1[i + 0] = L0;
S1[i + 1] = R0;
SET_KEY32 (S1, i + 0, L0);
SET_KEY32 (S1, i + 1, R0);
BF_ENCRYPT (L0, R0);
S1[i + 2] = L0;
S1[i + 3] = R0;
SET_KEY32 (S1, i + 2, L0);
SET_KEY32 (S1, i + 3, R0);
}
for (u32 i = 0; i < 256; i += 4)
{
BF_ENCRYPT (L0, R0);
S2[i + 0] = L0;
S2[i + 1] = R0;
SET_KEY32 (S2, i + 0, L0);
SET_KEY32 (S2, i + 1, R0);
BF_ENCRYPT (L0, R0);
S2[i + 2] = L0;
S2[i + 3] = R0;
SET_KEY32 (S2, i + 2, L0);
SET_KEY32 (S2, i + 3, R0);
}
for (u32 i = 0; i < 256; i += 4)
{
BF_ENCRYPT (L0, R0);
S3[i + 0] = L0;
S3[i + 1] = R0;
SET_KEY32 (S3, i + 0, L0);
SET_KEY32 (S3, i + 1, R0);
BF_ENCRYPT (L0, R0);
S3[i + 2] = L0;
S3[i + 3] = R0;
SET_KEY32 (S3, i + 2, L0);
SET_KEY32 (S3, i + 3, R0);
}
GLOBAL_AS const odf11_t *es = &esalt_bufs[DIGESTS_OFFSET];

@ -797,11 +797,7 @@ KERNEL_FQ void m22000_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pbkdf2_tmp_t, wpa_t))
s_te4[i] = te4[i];
}
#if defined IS_CUDA || defined IS_HIP
__syncthreads();
#else
SYNC_THREADS ();
#endif
#else

@ -610,11 +610,7 @@ KERNEL_FQ void m22001_aux3 (KERN_ATTR_TMPS_ESALT (wpa_pmk_tmp_t, wpa_t))
s_te4[i] = te4[i];
}
#if defined IS_CUDA || defined IS_HIP
__syncthreads();
#else
SYNC_THREADS ();
#endif
#else

@ -374,7 +374,7 @@ KERNEL_FQ void m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t))
digest[6] = sha256_hmac_ctx2.opad.h[6];
digest[7] = sha256_hmac_ctx2.opad.h[7];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]);
const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]);
#else
@ -402,7 +402,7 @@ KERNEL_FQ void m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t))
uint4 X[4];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
X[0] = make_uint4 (T[0].x, T[1].y, T[2].z, T[3].w);
X[1] = make_uint4 (T[1].x, T[2].y, T[3].z, T[0].w);
X[2] = make_uint4 (T[2].x, T[3].y, T[0].z, T[1].w);
@ -575,7 +575,7 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
uint4 T[4];
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
T[0] = make_uint4 (X[0].x, X[3].y, X[2].z, X[1].w);
T[1] = make_uint4 (X[1].x, X[0].y, X[3].z, X[2].w);
T[2] = make_uint4 (X[2].x, X[1].y, X[0].z, X[3].w);

@ -145,13 +145,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
u32 tmp0;
u32 tmp1;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
tmp0 = hc_bytealign_be (0, append, func_len);
tmp1 = hc_bytealign_be (append, 0, func_len);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8));
#endif
tmp0 = hc_byte_perm (append, 0, selector);
tmp1 = hc_byte_perm (0, append, selector);
@ -905,7 +912,7 @@ KERNEL_FQ void m23700_loop (KERN_ATTR_TMPS_ESALT (rar3_tmp_t, rar3_t))
* base
*/
const u32 pw_len = pws[gid].pw_len & 255;
const u32 pw_len = pws[gid].pw_len;
const u32 salt_len = 8;
@ -1079,7 +1086,7 @@ KERNEL_FQ void m23700_comp (KERN_ATTR_TMPS_ESALT (rar3_tmp_t, rar3_t))
* base
*/
const u32 pw_len = pws[gid].pw_len & 255;
const u32 pw_len = pws[gid].pw_len;
const u32 salt_len = 8;

@ -56,13 +56,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
u32 tmp0;
u32 tmp1;
#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
tmp0 = hc_bytealign_be (0, append, func_len);
tmp1 = hc_bytealign_be (append, 0, func_len);
#endif
#ifdef IS_NV
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff;
#endif
#if (defined IS_AMD || defined IS_HIP)
const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8));
#endif
tmp0 = hc_byte_perm (append, 0, selector);
tmp1 = hc_byte_perm (0, append, selector);
@ -816,7 +823,7 @@ KERNEL_FQ void m23800_loop (KERN_ATTR_TMPS_HOOKS_ESALT (rar3_tmp_t, rar3_hook_t,
* base
*/
const u32 pw_len = pws[gid].pw_len & 255;
const u32 pw_len = pws[gid].pw_len;
const u32 salt_len = 8;
@ -983,7 +990,7 @@ KERNEL_FQ void m23800_hook23 (KERN_ATTR_TMPS_HOOKS_ESALT (rar3_tmp_t, rar3_hook_
* base
*/
const u32 pw_len = pws[gid].pw_len & 255;
const u32 pw_len = pws[gid].pw_len;
const u32 salt_len = 8;

@ -0,0 +1,592 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_md5.cl"
#include "inc_hash_sha1.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 34
#define SNMPV3_MSG_AUTH_PARAMS_LEN 12
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 64
#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32)
#define SNMPV3_HASH_ELEMS_MD5 4
#define SNMPV3_HASH_ELEMS_SHA1 8 // 8 = aligned 5
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64
#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
#define SNMPV3_MAX_PW_LENGTH_OPT 64
#define SNMPV3_TMP_ELEMS_OPT ((SNMPV3_MAX_PW_LENGTH_OPT * SNMPV3_MAX_PW_LENGTH) / 4)
// (64 * 64) / 4 = 1024
// for pw length > 64 we use global memory reads
typedef struct hmac_md5_tmp
{
u32 tmp_md5[SNMPV3_TMP_ELEMS];
u32 tmp_sha1[SNMPV3_TMP_ELEMS];
u32 h_md5[SNMPV3_HASH_ELEMS_MD5];
u32 h_sha1[SNMPV3_HASH_ELEMS_SHA1];
} hmac_md5_tmp_t;
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
KERNEL_FQ void m25000_init (KERN_ATTR_TMPS_ESALT (hmac_md5_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32 w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u8 *src_ptr = (u8 *) w;
// password 64 times, also swapped
u32 dst_buf[16];
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 64; i++)
{
for (int j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 63;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 byte are written into cache
if (dst_idx == 63)
{
const int tmp_idx4 = (tmp_idx - 63) / 4;
// md5
tmps[gid].tmp_md5[tmp_idx4 + 0] = dst_buf[ 0];
tmps[gid].tmp_md5[tmp_idx4 + 1] = dst_buf[ 1];
tmps[gid].tmp_md5[tmp_idx4 + 2] = dst_buf[ 2];
tmps[gid].tmp_md5[tmp_idx4 + 3] = dst_buf[ 3];
tmps[gid].tmp_md5[tmp_idx4 + 4] = dst_buf[ 4];
tmps[gid].tmp_md5[tmp_idx4 + 5] = dst_buf[ 5];
tmps[gid].tmp_md5[tmp_idx4 + 6] = dst_buf[ 6];
tmps[gid].tmp_md5[tmp_idx4 + 7] = dst_buf[ 7];
tmps[gid].tmp_md5[tmp_idx4 + 8] = dst_buf[ 8];
tmps[gid].tmp_md5[tmp_idx4 + 9] = dst_buf[ 9];
tmps[gid].tmp_md5[tmp_idx4 + 10] = dst_buf[10];
tmps[gid].tmp_md5[tmp_idx4 + 11] = dst_buf[11];
tmps[gid].tmp_md5[tmp_idx4 + 12] = dst_buf[12];
tmps[gid].tmp_md5[tmp_idx4 + 13] = dst_buf[13];
tmps[gid].tmp_md5[tmp_idx4 + 14] = dst_buf[14];
tmps[gid].tmp_md5[tmp_idx4 + 15] = dst_buf[15];
// sha1
tmps[gid].tmp_sha1[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]);
tmps[gid].tmp_sha1[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]);
tmps[gid].tmp_sha1[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]);
tmps[gid].tmp_sha1[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]);
tmps[gid].tmp_sha1[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]);
tmps[gid].tmp_sha1[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]);
tmps[gid].tmp_sha1[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]);
tmps[gid].tmp_sha1[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]);
tmps[gid].tmp_sha1[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]);
tmps[gid].tmp_sha1[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]);
tmps[gid].tmp_sha1[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]);
tmps[gid].tmp_sha1[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]);
tmps[gid].tmp_sha1[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]);
tmps[gid].tmp_sha1[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]);
tmps[gid].tmp_sha1[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]);
tmps[gid].tmp_sha1[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]);
}
tmp_idx++;
}
}
// hash md5
tmps[gid].h_md5[0] = MD5M_A;
tmps[gid].h_md5[1] = MD5M_B;
tmps[gid].h_md5[2] = MD5M_C;
tmps[gid].h_md5[3] = MD5M_D;
// hash sha1
tmps[gid].h_sha1[0] = SHA1M_A;
tmps[gid].h_sha1[1] = SHA1M_B;
tmps[gid].h_sha1[2] = SHA1M_C;
tmps[gid].h_sha1[3] = SHA1M_D;
tmps[gid].h_sha1[4] = SHA1M_E;
}
KERNEL_FQ void m25000_loop (KERN_ATTR_TMPS_ESALT (hmac_md5_tmp_t, snmpv3_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 h_md5[4];
h_md5[0] = tmps[gid].h_md5[0];
h_md5[1] = tmps[gid].h_md5[1];
h_md5[2] = tmps[gid].h_md5[2];
h_md5[3] = tmps[gid].h_md5[3];
u32 h_sha1[5];
h_sha1[0] = tmps[gid].h_sha1[0];
h_sha1[1] = tmps[gid].h_sha1[1];
h_sha1[2] = tmps[gid].h_sha1[2];
h_sha1[3] = tmps[gid].h_sha1[3];
h_sha1[4] = tmps[gid].h_sha1[4];
const u32 pw_len = pws[gid].pw_len;
const int pw_len64 = pw_len * 64;
if (pw_len <= SNMPV3_MAX_PW_LENGTH_OPT)
{
u32 tmp_shared[SNMPV3_TMP_ELEMS_OPT];
// md5
for (int i = 0; i < pw_len64 / 4; i++)
{
tmp_shared[i] = tmps[gid].tmp_md5[i];
}
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmp_shared[idx + 0];
w0[1] = tmp_shared[idx + 1];
w0[2] = tmp_shared[idx + 2];
w0[3] = tmp_shared[idx + 3];
w1[0] = tmp_shared[idx + 4];
w1[1] = tmp_shared[idx + 5];
w1[2] = tmp_shared[idx + 6];
w1[3] = tmp_shared[idx + 7];
w2[0] = tmp_shared[idx + 8];
w2[1] = tmp_shared[idx + 9];
w2[2] = tmp_shared[idx + 10];
w2[3] = tmp_shared[idx + 11];
w3[0] = tmp_shared[idx + 12];
w3[1] = tmp_shared[idx + 13];
w3[2] = tmp_shared[idx + 14];
w3[3] = tmp_shared[idx + 15];
md5_transform (w0, w1, w2, w3, h_md5);
}
// sha1
for (int i = 0; i < pw_len64 / 4; i++)
{
tmp_shared[i] = tmps[gid].tmp_sha1[i];
}
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmp_shared[idx + 0];
w0[1] = tmp_shared[idx + 1];
w0[2] = tmp_shared[idx + 2];
w0[3] = tmp_shared[idx + 3];
w1[0] = tmp_shared[idx + 4];
w1[1] = tmp_shared[idx + 5];
w1[2] = tmp_shared[idx + 6];
w1[3] = tmp_shared[idx + 7];
w2[0] = tmp_shared[idx + 8];
w2[1] = tmp_shared[idx + 9];
w2[2] = tmp_shared[idx + 10];
w2[3] = tmp_shared[idx + 11];
w3[0] = tmp_shared[idx + 12];
w3[1] = tmp_shared[idx + 13];
w3[2] = tmp_shared[idx + 14];
w3[3] = tmp_shared[idx + 15];
sha1_transform (w0, w1, w2, w3, h_sha1);
}
}
else
{
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
// md5
w0[0] = tmps[gid].tmp_md5[idx + 0];
w0[1] = tmps[gid].tmp_md5[idx + 1];
w0[2] = tmps[gid].tmp_md5[idx + 2];
w0[3] = tmps[gid].tmp_md5[idx + 3];
w1[0] = tmps[gid].tmp_md5[idx + 4];
w1[1] = tmps[gid].tmp_md5[idx + 5];
w1[2] = tmps[gid].tmp_md5[idx + 6];
w1[3] = tmps[gid].tmp_md5[idx + 7];
w2[0] = tmps[gid].tmp_md5[idx + 8];
w2[1] = tmps[gid].tmp_md5[idx + 9];
w2[2] = tmps[gid].tmp_md5[idx + 10];
w2[3] = tmps[gid].tmp_md5[idx + 11];
w3[0] = tmps[gid].tmp_md5[idx + 12];
w3[1] = tmps[gid].tmp_md5[idx + 13];
w3[2] = tmps[gid].tmp_md5[idx + 14];
w3[3] = tmps[gid].tmp_md5[idx + 15];
md5_transform (w0, w1, w2, w3, h_md5);
// sha1
w0[0] = tmps[gid].tmp_sha1[idx + 0];
w0[1] = tmps[gid].tmp_sha1[idx + 1];
w0[2] = tmps[gid].tmp_sha1[idx + 2];
w0[3] = tmps[gid].tmp_sha1[idx + 3];
w1[0] = tmps[gid].tmp_sha1[idx + 4];
w1[1] = tmps[gid].tmp_sha1[idx + 5];
w1[2] = tmps[gid].tmp_sha1[idx + 6];
w1[3] = tmps[gid].tmp_sha1[idx + 7];
w2[0] = tmps[gid].tmp_sha1[idx + 8];
w2[1] = tmps[gid].tmp_sha1[idx + 9];
w2[2] = tmps[gid].tmp_sha1[idx + 10];
w2[3] = tmps[gid].tmp_sha1[idx + 11];
w3[0] = tmps[gid].tmp_sha1[idx + 12];
w3[1] = tmps[gid].tmp_sha1[idx + 13];
w3[2] = tmps[gid].tmp_sha1[idx + 14];
w3[3] = tmps[gid].tmp_sha1[idx + 15];
sha1_transform (w0, w1, w2, w3, h_sha1);
}
}
tmps[gid].h_md5[0] = h_md5[0];
tmps[gid].h_md5[1] = h_md5[1];
tmps[gid].h_md5[2] = h_md5[2];
tmps[gid].h_md5[3] = h_md5[3];
tmps[gid].h_sha1[0] = h_sha1[0];
tmps[gid].h_sha1[1] = h_sha1[1];
tmps[gid].h_sha1[2] = h_sha1[2];
tmps[gid].h_sha1[3] = h_sha1[3];
tmps[gid].h_sha1[4] = h_sha1[4];
}
KERNEL_FQ void m25000_comp (KERN_ATTR_TMPS_ESALT (hmac_md5_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
// md5
w0[0] = 0x00000080;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 1048576 * 8;
w3[3] = 0;
u32 h_md5[4];
h_md5[0] = tmps[gid].h_md5[0];
h_md5[1] = tmps[gid].h_md5[1];
h_md5[2] = tmps[gid].h_md5[2];
h_md5[3] = tmps[gid].h_md5[3];
md5_transform (w0, w1, w2, w3, h_md5);
// sha1
w0[0] = 0x80000000;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 1048576 * 8;
u32 h_sha1[5];
h_sha1[0] = tmps[gid].h_sha1[0];
h_sha1[1] = tmps[gid].h_sha1[1];
h_sha1[2] = tmps[gid].h_sha1[2];
h_sha1[3] = tmps[gid].h_sha1[3];
h_sha1[4] = tmps[gid].h_sha1[4];
sha1_transform (w0, w1, w2, w3, h_sha1);
md5_ctx_t md5_ctx;
sha1_ctx_t sha1_ctx;
md5_init (&md5_ctx);
sha1_init (&sha1_ctx);
u32 w[16];
// md5
w[ 0] = h_md5[0];
w[ 1] = h_md5[1];
w[ 2] = h_md5[2];
w[ 3] = h_md5[3];
w[ 4] = 0;
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
md5_update (&md5_ctx, w, 16);
// sha1
w[ 0] = h_sha1[0];
w[ 1] = h_sha1[1];
w[ 2] = h_sha1[2];
w[ 3] = h_sha1[3];
w[ 4] = h_sha1[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_update (&sha1_ctx, w, 20);
// engineID
md5_update_global (&md5_ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
sha1_update_global_swap (&sha1_ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
// md5
w[ 0] = h_md5[0];
w[ 1] = h_md5[1];
w[ 2] = h_md5[2];
w[ 3] = h_md5[3];
w[ 4] = 0;
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
md5_update (&md5_ctx, w, 16);
// sha1
w[ 0] = h_sha1[0];
w[ 1] = h_sha1[1];
w[ 2] = h_sha1[2];
w[ 3] = h_sha1[3];
w[ 4] = h_sha1[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_update (&sha1_ctx, w, 20);
md5_final (&md5_ctx);
sha1_final (&sha1_ctx);
// md5
w[ 0] = md5_ctx.h[0];
w[ 1] = md5_ctx.h[1];
w[ 2] = md5_ctx.h[2];
w[ 3] = md5_ctx.h[3];
w[ 4] = 0;
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
md5_hmac_ctx_t md5_hmac_ctx;
md5_hmac_init (&md5_hmac_ctx, w, 16);
md5_hmac_update_global (&md5_hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
md5_hmac_final (&md5_hmac_ctx);
{
const u32 r0 = hc_swap32_S (md5_hmac_ctx.opad.h[DGST_R0]);
const u32 r1 = hc_swap32_S (md5_hmac_ctx.opad.h[DGST_R1]);
const u32 r2 = hc_swap32_S (md5_hmac_ctx.opad.h[DGST_R2]);
const u32 r3 = 0;
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}
// sha1
w[ 0] = sha1_ctx.h[0];
w[ 1] = sha1_ctx.h[1];
w[ 2] = sha1_ctx.h[2];
w[ 3] = sha1_ctx.h[3];
w[ 4] = sha1_ctx.h[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_hmac_ctx_t sha1_hmac_ctx;
sha1_hmac_init (&sha1_hmac_ctx, w, 20);
sha1_hmac_update_global_swap (&sha1_hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha1_hmac_final (&sha1_hmac_ctx);
{
const u32 r0 = sha1_hmac_ctx.opad.h[DGST_R0];
const u32 r1 = sha1_hmac_ctx.opad.h[DGST_R1];
const u32 r2 = sha1_hmac_ctx.opad.h[DGST_R2];
const u32 r3 = 0;
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}
}

@ -0,0 +1,355 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_md5.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 34
#define SNMPV3_MSG_AUTH_PARAMS_LEN 12
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 64
#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 4
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64
#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
#define SNMPV3_MAX_PW_LENGTH_OPT 64
#define SNMPV3_TMP_ELEMS_OPT ((SNMPV3_MAX_PW_LENGTH_OPT * SNMPV3_MAX_PW_LENGTH) / 4)
// (64 * 64) / 4 = 1024
// for pw length > 64 we use global memory reads
typedef struct hmac_md5_tmp
{
u32 tmp[SNMPV3_TMP_ELEMS];
u32 h[SNMPV3_HASH_ELEMS];
} hmac_md5_tmp_t;
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
KERNEL_FQ void m25100_init (KERN_ATTR_TMPS_ESALT (hmac_md5_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32 w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u8 *src_ptr = (u8 *) w;
// password 64 times, also swapped
u32 dst_buf[16];
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 64; i++)
{
for (int j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 63;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 byte are written into cache
if (dst_idx == 63)
{
const int tmp_idx4 = (tmp_idx - 63) / 4;
tmps[gid].tmp[tmp_idx4 + 0] = dst_buf[ 0];
tmps[gid].tmp[tmp_idx4 + 1] = dst_buf[ 1];
tmps[gid].tmp[tmp_idx4 + 2] = dst_buf[ 2];
tmps[gid].tmp[tmp_idx4 + 3] = dst_buf[ 3];
tmps[gid].tmp[tmp_idx4 + 4] = dst_buf[ 4];
tmps[gid].tmp[tmp_idx4 + 5] = dst_buf[ 5];
tmps[gid].tmp[tmp_idx4 + 6] = dst_buf[ 6];
tmps[gid].tmp[tmp_idx4 + 7] = dst_buf[ 7];
tmps[gid].tmp[tmp_idx4 + 8] = dst_buf[ 8];
tmps[gid].tmp[tmp_idx4 + 9] = dst_buf[ 9];
tmps[gid].tmp[tmp_idx4 + 10] = dst_buf[10];
tmps[gid].tmp[tmp_idx4 + 11] = dst_buf[11];
tmps[gid].tmp[tmp_idx4 + 12] = dst_buf[12];
tmps[gid].tmp[tmp_idx4 + 13] = dst_buf[13];
tmps[gid].tmp[tmp_idx4 + 14] = dst_buf[14];
tmps[gid].tmp[tmp_idx4 + 15] = dst_buf[15];
}
tmp_idx++;
}
}
// hash
tmps[gid].h[0] = MD5M_A;
tmps[gid].h[1] = MD5M_B;
tmps[gid].h[2] = MD5M_C;
tmps[gid].h[3] = MD5M_D;
}
KERNEL_FQ void m25100_loop (KERN_ATTR_TMPS_ESALT (hmac_md5_tmp_t, snmpv3_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 h[4];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
const u32 pw_len = pws[gid].pw_len;
const int pw_len64 = pw_len * 64;
if (pw_len <= SNMPV3_MAX_PW_LENGTH_OPT)
{
u32 tmp[SNMPV3_TMP_ELEMS_OPT];
for (int i = 0; i < pw_len64 / 4; i++)
{
tmp[i] = tmps[gid].tmp[i];
}
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmp[idx + 0];
w0[1] = tmp[idx + 1];
w0[2] = tmp[idx + 2];
w0[3] = tmp[idx + 3];
w1[0] = tmp[idx + 4];
w1[1] = tmp[idx + 5];
w1[2] = tmp[idx + 6];
w1[3] = tmp[idx + 7];
w2[0] = tmp[idx + 8];
w2[1] = tmp[idx + 9];
w2[2] = tmp[idx + 10];
w2[3] = tmp[idx + 11];
w3[0] = tmp[idx + 12];
w3[1] = tmp[idx + 13];
w3[2] = tmp[idx + 14];
w3[3] = tmp[idx + 15];
md5_transform (w0, w1, w2, w3, h);
}
}
else
{
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmps[gid].tmp[idx + 0];
w0[1] = tmps[gid].tmp[idx + 1];
w0[2] = tmps[gid].tmp[idx + 2];
w0[3] = tmps[gid].tmp[idx + 3];
w1[0] = tmps[gid].tmp[idx + 4];
w1[1] = tmps[gid].tmp[idx + 5];
w1[2] = tmps[gid].tmp[idx + 6];
w1[3] = tmps[gid].tmp[idx + 7];
w2[0] = tmps[gid].tmp[idx + 8];
w2[1] = tmps[gid].tmp[idx + 9];
w2[2] = tmps[gid].tmp[idx + 10];
w2[3] = tmps[gid].tmp[idx + 11];
w3[0] = tmps[gid].tmp[idx + 12];
w3[1] = tmps[gid].tmp[idx + 13];
w3[2] = tmps[gid].tmp[idx + 14];
w3[3] = tmps[gid].tmp[idx + 15];
md5_transform (w0, w1, w2, w3, h);
}
}
tmps[gid].h[0] = h[0];
tmps[gid].h[1] = h[1];
tmps[gid].h[2] = h[2];
tmps[gid].h[3] = h[3];
}
KERNEL_FQ void m25100_comp (KERN_ATTR_TMPS_ESALT (hmac_md5_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = 0x00000080;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 1048576 * 8;
w3[3] = 0;
u32 h[4];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
md5_transform (w0, w1, w2, w3, h);
md5_ctx_t ctx;
md5_init (&ctx);
u32 w[16];
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = 0;
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
md5_update (&ctx, w, 16);
md5_update_global (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = 0;
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
md5_update (&ctx, w, 16);
md5_final (&ctx);
w[ 0] = ctx.h[0];
w[ 1] = ctx.h[1];
w[ 2] = ctx.h[2];
w[ 3] = ctx.h[3];
w[ 4] = 0;
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
md5_hmac_ctx_t hmac_ctx;
md5_hmac_init (&hmac_ctx, w, 16);
md5_hmac_update_global (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
md5_hmac_final (&hmac_ctx);
const u32 r0 = hmac_ctx.opad.h[DGST_R0];
const u32 r1 = hmac_ctx.opad.h[DGST_R1];
const u32 r2 = hmac_ctx.opad.h[DGST_R2];
const u32 r3 = 0;
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,360 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha1.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 34
#define SNMPV3_MSG_AUTH_PARAMS_LEN 12
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 64
#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 8 // 8 = aligned 5
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64
#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
#define SNMPV3_MAX_PW_LENGTH_OPT 64
#define SNMPV3_TMP_ELEMS_OPT ((SNMPV3_MAX_PW_LENGTH_OPT * SNMPV3_MAX_PW_LENGTH) / 4)
// (64 * 64) / 4 = 1024
// for pw length > 64 we use global memory reads
typedef struct hmac_sha1_tmp
{
u32 tmp[SNMPV3_TMP_ELEMS];
u32 h[SNMPV3_HASH_ELEMS];
} hmac_sha1_tmp_t;
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
KERNEL_FQ void m25200_init (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32 w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u8 *src_ptr = (u8 *) w;
// password 64 times, also swapped
u32 dst_buf[16];
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 64; i++)
{
for (int j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 63;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 byte are written into cache
if (dst_idx == 63)
{
const int tmp_idx4 = (tmp_idx - 63) / 4;
tmps[gid].tmp[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]);
tmps[gid].tmp[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]);
tmps[gid].tmp[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]);
tmps[gid].tmp[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]);
tmps[gid].tmp[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]);
tmps[gid].tmp[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]);
tmps[gid].tmp[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]);
tmps[gid].tmp[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]);
tmps[gid].tmp[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]);
tmps[gid].tmp[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]);
tmps[gid].tmp[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]);
tmps[gid].tmp[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]);
tmps[gid].tmp[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]);
tmps[gid].tmp[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]);
tmps[gid].tmp[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]);
tmps[gid].tmp[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]);
}
tmp_idx++;
}
}
// hash
tmps[gid].h[0] = SHA1M_A;
tmps[gid].h[1] = SHA1M_B;
tmps[gid].h[2] = SHA1M_C;
tmps[gid].h[3] = SHA1M_D;
tmps[gid].h[4] = SHA1M_E;
}
KERNEL_FQ void m25200_loop (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 h[5];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
const u32 pw_len = pws[gid].pw_len;
const int pw_len64 = pw_len * 64;
if (pw_len <= SNMPV3_MAX_PW_LENGTH_OPT)
{
u32 tmp[SNMPV3_TMP_ELEMS_OPT];
for (int i = 0; i < pw_len64 / 4; i++)
{
tmp[i] = tmps[gid].tmp[i];
}
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmp[idx + 0];
w0[1] = tmp[idx + 1];
w0[2] = tmp[idx + 2];
w0[3] = tmp[idx + 3];
w1[0] = tmp[idx + 4];
w1[1] = tmp[idx + 5];
w1[2] = tmp[idx + 6];
w1[3] = tmp[idx + 7];
w2[0] = tmp[idx + 8];
w2[1] = tmp[idx + 9];
w2[2] = tmp[idx + 10];
w2[3] = tmp[idx + 11];
w3[0] = tmp[idx + 12];
w3[1] = tmp[idx + 13];
w3[2] = tmp[idx + 14];
w3[3] = tmp[idx + 15];
sha1_transform (w0, w1, w2, w3, h);
}
}
else
{
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmps[gid].tmp[idx + 0];
w0[1] = tmps[gid].tmp[idx + 1];
w0[2] = tmps[gid].tmp[idx + 2];
w0[3] = tmps[gid].tmp[idx + 3];
w1[0] = tmps[gid].tmp[idx + 4];
w1[1] = tmps[gid].tmp[idx + 5];
w1[2] = tmps[gid].tmp[idx + 6];
w1[3] = tmps[gid].tmp[idx + 7];
w2[0] = tmps[gid].tmp[idx + 8];
w2[1] = tmps[gid].tmp[idx + 9];
w2[2] = tmps[gid].tmp[idx + 10];
w2[3] = tmps[gid].tmp[idx + 11];
w3[0] = tmps[gid].tmp[idx + 12];
w3[1] = tmps[gid].tmp[idx + 13];
w3[2] = tmps[gid].tmp[idx + 14];
w3[3] = tmps[gid].tmp[idx + 15];
sha1_transform (w0, w1, w2, w3, h);
}
}
tmps[gid].h[0] = h[0];
tmps[gid].h[1] = h[1];
tmps[gid].h[2] = h[2];
tmps[gid].h[3] = h[3];
tmps[gid].h[4] = h[4];
}
KERNEL_FQ void m25200_comp (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = 0x80000000;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 1048576 * 8;
u32 h[5];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
sha1_transform (w0, w1, w2, w3, h);
sha1_ctx_t ctx;
sha1_init (&ctx);
u32 w[16];
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_update (&ctx, w, 20);
sha1_update_global_swap (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_update (&ctx, w, 20);
sha1_final (&ctx);
w[ 0] = ctx.h[0];
w[ 1] = ctx.h[1];
w[ 2] = ctx.h[2];
w[ 3] = ctx.h[3];
w[ 4] = ctx.h[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_hmac_ctx_t hmac_ctx;
sha1_hmac_init (&hmac_ctx, w, 20);
sha1_hmac_update_global_swap (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha1_hmac_final (&hmac_ctx);
const u32 r0 = hmac_ctx.opad.h[DGST_R0];
const u32 r1 = hmac_ctx.opad.h[DGST_R1];
const u32 r2 = hmac_ctx.opad.h[DGST_R2];
const u32 r3 = 0;
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,371 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha224.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 34
#define SNMPV3_MSG_AUTH_PARAMS_MAX 16
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 64
#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 8
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64
#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
#define SNMPV3_MAX_PW_LENGTH_OPT 64
#define SNMPV3_TMP_ELEMS_OPT ((SNMPV3_MAX_PW_LENGTH_OPT * SNMPV3_MAX_PW_LENGTH) / 4)
// (64 * 64) / 4 = 1024
// for pw length > 64 we use global memory reads
typedef struct hmac_sha224_tmp
{
u32 tmp[SNMPV3_TMP_ELEMS];
u32 h[SNMPV3_HASH_ELEMS];
} hmac_sha224_tmp_t;
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
KERNEL_FQ void m26700_init (KERN_ATTR_TMPS_ESALT (hmac_sha224_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32 w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u8 *src_ptr = (u8 *) w;
// password 64 times, also swapped
u32 dst_buf[16];
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 64; i++)
{
for (int j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 63;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 byte are written into cache
if (dst_idx == 63)
{
const int tmp_idx4 = (tmp_idx - 63) / 4;
tmps[gid].tmp[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]);
tmps[gid].tmp[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]);
tmps[gid].tmp[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]);
tmps[gid].tmp[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]);
tmps[gid].tmp[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]);
tmps[gid].tmp[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]);
tmps[gid].tmp[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]);
tmps[gid].tmp[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]);
tmps[gid].tmp[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]);
tmps[gid].tmp[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]);
tmps[gid].tmp[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]);
tmps[gid].tmp[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]);
tmps[gid].tmp[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]);
tmps[gid].tmp[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]);
tmps[gid].tmp[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]);
tmps[gid].tmp[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]);
}
tmp_idx++;
}
}
// hash
tmps[gid].h[0] = SHA224M_A;
tmps[gid].h[1] = SHA224M_B;
tmps[gid].h[2] = SHA224M_C;
tmps[gid].h[3] = SHA224M_D;
tmps[gid].h[4] = SHA224M_E;
tmps[gid].h[5] = SHA224M_F;
tmps[gid].h[6] = SHA224M_G;
tmps[gid].h[7] = SHA224M_H;
}
KERNEL_FQ void m26700_loop (KERN_ATTR_TMPS_ESALT (hmac_sha224_tmp_t, snmpv3_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
const u32 pw_len = pws[gid].pw_len;
const int pw_len64 = pw_len * 64;
if (pw_len <= SNMPV3_MAX_PW_LENGTH_OPT)
{
u32 tmp[SNMPV3_TMP_ELEMS_OPT];
for (int i = 0; i < pw_len64 / 4; i++)
{
tmp[i] = tmps[gid].tmp[i];
}
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmp[idx + 0];
w0[1] = tmp[idx + 1];
w0[2] = tmp[idx + 2];
w0[3] = tmp[idx + 3];
w1[0] = tmp[idx + 4];
w1[1] = tmp[idx + 5];
w1[2] = tmp[idx + 6];
w1[3] = tmp[idx + 7];
w2[0] = tmp[idx + 8];
w2[1] = tmp[idx + 9];
w2[2] = tmp[idx + 10];
w2[3] = tmp[idx + 11];
w3[0] = tmp[idx + 12];
w3[1] = tmp[idx + 13];
w3[2] = tmp[idx + 14];
w3[3] = tmp[idx + 15];
sha224_transform (w0, w1, w2, w3, h);
}
}
else
{
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmps[gid].tmp[idx + 0];
w0[1] = tmps[gid].tmp[idx + 1];
w0[2] = tmps[gid].tmp[idx + 2];
w0[3] = tmps[gid].tmp[idx + 3];
w1[0] = tmps[gid].tmp[idx + 4];
w1[1] = tmps[gid].tmp[idx + 5];
w1[2] = tmps[gid].tmp[idx + 6];
w1[3] = tmps[gid].tmp[idx + 7];
w2[0] = tmps[gid].tmp[idx + 8];
w2[1] = tmps[gid].tmp[idx + 9];
w2[2] = tmps[gid].tmp[idx + 10];
w2[3] = tmps[gid].tmp[idx + 11];
w3[0] = tmps[gid].tmp[idx + 12];
w3[1] = tmps[gid].tmp[idx + 13];
w3[2] = tmps[gid].tmp[idx + 14];
w3[3] = tmps[gid].tmp[idx + 15];
sha224_transform (w0, w1, w2, w3, h);
}
}
tmps[gid].h[0] = h[0];
tmps[gid].h[1] = h[1];
tmps[gid].h[2] = h[2];
tmps[gid].h[3] = h[3];
tmps[gid].h[4] = h[4];
tmps[gid].h[5] = h[5];
tmps[gid].h[6] = h[6];
tmps[gid].h[7] = h[7];
}
KERNEL_FQ void m26700_comp (KERN_ATTR_TMPS_ESALT (hmac_sha224_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = 0x80000000;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 1048576 * 8;
u32 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
sha224_transform (w0, w1, w2, w3, h);
sha224_ctx_t ctx;
sha224_init (&ctx);
u32 w[16];
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = h[5];
w[ 6] = h[6];
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha224_update (&ctx, w, 28);
sha224_update_global_swap (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = h[5];
w[ 6] = h[6];
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha224_update (&ctx, w, 28);
sha224_final (&ctx);
w[ 0] = ctx.h[0];
w[ 1] = ctx.h[1];
w[ 2] = ctx.h[2];
w[ 3] = ctx.h[3];
w[ 4] = ctx.h[4];
w[ 5] = ctx.h[5];
w[ 6] = ctx.h[6];
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha224_hmac_ctx_t hmac_ctx;
sha224_hmac_init (&hmac_ctx, w, 28);
sha224_hmac_update_global_swap (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha224_hmac_final (&hmac_ctx);
const u32 r0 = hmac_ctx.opad.h[DGST_R0];
const u32 r1 = hmac_ctx.opad.h[DGST_R1];
const u32 r2 = hmac_ctx.opad.h[DGST_R2];
const u32 r3 = hmac_ctx.opad.h[DGST_R3];
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,371 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha256.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 34
#define SNMPV3_MSG_AUTH_PARAMS_MAX 24
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 64
#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 8
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64
#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
#define SNMPV3_MAX_PW_LENGTH_OPT 64
#define SNMPV3_TMP_ELEMS_OPT ((SNMPV3_MAX_PW_LENGTH_OPT * SNMPV3_MAX_PW_LENGTH) / 4)
// (64 * 64) / 4 = 1024
// for pw length > 64 we use global memory reads
typedef struct hmac_sha256_tmp
{
u32 tmp[SNMPV3_TMP_ELEMS];
u32 h[SNMPV3_HASH_ELEMS];
} hmac_sha256_tmp_t;
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
KERNEL_FQ void m26800_init (KERN_ATTR_TMPS_ESALT (hmac_sha256_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32 w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u8 *src_ptr = (u8 *) w;
// password 64 times, also swapped
u32 dst_buf[16];
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 64; i++)
{
for (int j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 63;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 byte are written into cache
if (dst_idx == 63)
{
const int tmp_idx4 = (tmp_idx - 63) / 4;
tmps[gid].tmp[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]);
tmps[gid].tmp[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]);
tmps[gid].tmp[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]);
tmps[gid].tmp[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]);
tmps[gid].tmp[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]);
tmps[gid].tmp[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]);
tmps[gid].tmp[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]);
tmps[gid].tmp[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]);
tmps[gid].tmp[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]);
tmps[gid].tmp[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]);
tmps[gid].tmp[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]);
tmps[gid].tmp[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]);
tmps[gid].tmp[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]);
tmps[gid].tmp[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]);
tmps[gid].tmp[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]);
tmps[gid].tmp[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]);
}
tmp_idx++;
}
}
// hash
tmps[gid].h[0] = SHA256M_A;
tmps[gid].h[1] = SHA256M_B;
tmps[gid].h[2] = SHA256M_C;
tmps[gid].h[3] = SHA256M_D;
tmps[gid].h[4] = SHA256M_E;
tmps[gid].h[5] = SHA256M_F;
tmps[gid].h[6] = SHA256M_G;
tmps[gid].h[7] = SHA256M_H;
}
KERNEL_FQ void m26800_loop (KERN_ATTR_TMPS_ESALT (hmac_sha256_tmp_t, snmpv3_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
const u32 pw_len = pws[gid].pw_len;
const int pw_len64 = pw_len * 64;
if (pw_len <= SNMPV3_MAX_PW_LENGTH_OPT)
{
u32 tmp[SNMPV3_TMP_ELEMS_OPT];
for (int i = 0; i < pw_len64 / 4; i++)
{
tmp[i] = tmps[gid].tmp[i];
}
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmp[idx + 0];
w0[1] = tmp[idx + 1];
w0[2] = tmp[idx + 2];
w0[3] = tmp[idx + 3];
w1[0] = tmp[idx + 4];
w1[1] = tmp[idx + 5];
w1[2] = tmp[idx + 6];
w1[3] = tmp[idx + 7];
w2[0] = tmp[idx + 8];
w2[1] = tmp[idx + 9];
w2[2] = tmp[idx + 10];
w2[3] = tmp[idx + 11];
w3[0] = tmp[idx + 12];
w3[1] = tmp[idx + 13];
w3[2] = tmp[idx + 14];
w3[3] = tmp[idx + 15];
sha256_transform (w0, w1, w2, w3, h);
}
}
else
{
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmps[gid].tmp[idx + 0];
w0[1] = tmps[gid].tmp[idx + 1];
w0[2] = tmps[gid].tmp[idx + 2];
w0[3] = tmps[gid].tmp[idx + 3];
w1[0] = tmps[gid].tmp[idx + 4];
w1[1] = tmps[gid].tmp[idx + 5];
w1[2] = tmps[gid].tmp[idx + 6];
w1[3] = tmps[gid].tmp[idx + 7];
w2[0] = tmps[gid].tmp[idx + 8];
w2[1] = tmps[gid].tmp[idx + 9];
w2[2] = tmps[gid].tmp[idx + 10];
w2[3] = tmps[gid].tmp[idx + 11];
w3[0] = tmps[gid].tmp[idx + 12];
w3[1] = tmps[gid].tmp[idx + 13];
w3[2] = tmps[gid].tmp[idx + 14];
w3[3] = tmps[gid].tmp[idx + 15];
sha256_transform (w0, w1, w2, w3, h);
}
}
tmps[gid].h[0] = h[0];
tmps[gid].h[1] = h[1];
tmps[gid].h[2] = h[2];
tmps[gid].h[3] = h[3];
tmps[gid].h[4] = h[4];
tmps[gid].h[5] = h[5];
tmps[gid].h[6] = h[6];
tmps[gid].h[7] = h[7];
}
KERNEL_FQ void m26800_comp (KERN_ATTR_TMPS_ESALT (hmac_sha256_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = 0x80000000;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 1048576 * 8;
u32 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
sha256_transform (w0, w1, w2, w3, h);
sha256_ctx_t ctx;
sha256_init (&ctx);
u32 w[16];
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = h[5];
w[ 6] = h[6];
w[ 7] = h[7];
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha256_update (&ctx, w, 32);
sha256_update_global_swap (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = h[5];
w[ 6] = h[6];
w[ 7] = h[7];
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha256_update (&ctx, w, 32);
sha256_final (&ctx);
w[ 0] = ctx.h[0];
w[ 1] = ctx.h[1];
w[ 2] = ctx.h[2];
w[ 3] = ctx.h[3];
w[ 4] = ctx.h[4];
w[ 5] = ctx.h[5];
w[ 6] = ctx.h[6];
w[ 7] = ctx.h[7];
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha256_hmac_ctx_t hmac_ctx;
sha256_hmac_init (&hmac_ctx, w, 32);
sha256_hmac_update_global_swap (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha256_hmac_final (&hmac_ctx);
const u32 r0 = hmac_ctx.opad.h[DGST_R0];
const u32 r1 = hmac_ctx.opad.h[DGST_R1];
const u32 r2 = hmac_ctx.opad.h[DGST_R2];
const u32 r3 = hmac_ctx.opad.h[DGST_R3];
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,495 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha384.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 34
#define SNMPV3_MSG_AUTH_PARAMS_MAX 32
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 128
#define SNMPV3_TMP_ELEMS 8192 // 8192 = (256 (max pw length) * SNMPV3_MAX_PW_LENGTH) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 8
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of SNMPV3_MAX_PW_LENGTH
#define SNMPV3_MAX_ENGINE_ELEMS 32 // 32 * 4 = 128 > 34, also has to be multiple of SNMPV3_MAX_PW_LENGTH
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
#define SNMPV3_MAX_PW_LENGTH_OPT 32
#define SNMPV3_TMP_ELEMS_OPT ((SNMPV3_MAX_PW_LENGTH_OPT * SNMPV3_MAX_PW_LENGTH) / 4)
// (32 * 128) / 4 = 1024
// for pw length > 32 we use global memory reads
typedef struct hmac_sha384_tmp
{
u32 tmp[SNMPV3_TMP_ELEMS];
u64 h[SNMPV3_HASH_ELEMS];
} hmac_sha384_tmp_t;
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
KERNEL_FQ void m26900_init (KERN_ATTR_TMPS_ESALT (hmac_sha384_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32 w[128] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u8 *src_ptr = (u8 *) w;
// password 128 times, also swapped
u32 dst_buf[32];
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 128; i++)
{
for (u32 j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 127;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 byte are written into cache
if (dst_idx == 127)
{
const int tmp_idx4 = (tmp_idx - 127) / 4;
tmps[gid].tmp[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]);
tmps[gid].tmp[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]);
tmps[gid].tmp[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]);
tmps[gid].tmp[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]);
tmps[gid].tmp[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]);
tmps[gid].tmp[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]);
tmps[gid].tmp[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]);
tmps[gid].tmp[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]);
tmps[gid].tmp[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]);
tmps[gid].tmp[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]);
tmps[gid].tmp[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]);
tmps[gid].tmp[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]);
tmps[gid].tmp[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]);
tmps[gid].tmp[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]);
tmps[gid].tmp[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]);
tmps[gid].tmp[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]);
tmps[gid].tmp[tmp_idx4 + 16] = hc_swap32_S (dst_buf[16]);
tmps[gid].tmp[tmp_idx4 + 17] = hc_swap32_S (dst_buf[17]);
tmps[gid].tmp[tmp_idx4 + 18] = hc_swap32_S (dst_buf[18]);
tmps[gid].tmp[tmp_idx4 + 19] = hc_swap32_S (dst_buf[19]);
tmps[gid].tmp[tmp_idx4 + 20] = hc_swap32_S (dst_buf[20]);
tmps[gid].tmp[tmp_idx4 + 21] = hc_swap32_S (dst_buf[21]);
tmps[gid].tmp[tmp_idx4 + 22] = hc_swap32_S (dst_buf[22]);
tmps[gid].tmp[tmp_idx4 + 23] = hc_swap32_S (dst_buf[23]);
tmps[gid].tmp[tmp_idx4 + 24] = hc_swap32_S (dst_buf[24]);
tmps[gid].tmp[tmp_idx4 + 25] = hc_swap32_S (dst_buf[25]);
tmps[gid].tmp[tmp_idx4 + 26] = hc_swap32_S (dst_buf[26]);
tmps[gid].tmp[tmp_idx4 + 27] = hc_swap32_S (dst_buf[27]);
tmps[gid].tmp[tmp_idx4 + 28] = hc_swap32_S (dst_buf[28]);
tmps[gid].tmp[tmp_idx4 + 29] = hc_swap32_S (dst_buf[29]);
tmps[gid].tmp[tmp_idx4 + 30] = hc_swap32_S (dst_buf[30]);
tmps[gid].tmp[tmp_idx4 + 31] = hc_swap32_S (dst_buf[31]);
}
tmp_idx++;
}
}
// hash
tmps[gid].h[0] = SHA384M_A;
tmps[gid].h[1] = SHA384M_B;
tmps[gid].h[2] = SHA384M_C;
tmps[gid].h[3] = SHA384M_D;
tmps[gid].h[4] = SHA384M_E;
tmps[gid].h[5] = SHA384M_F;
tmps[gid].h[6] = SHA384M_G;
tmps[gid].h[7] = SHA384M_H;
}
KERNEL_FQ void m26900_loop (KERN_ATTR_TMPS_ESALT (hmac_sha384_tmp_t, snmpv3_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u64 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
const u32 pw_len = pws[gid].pw_len;
const int pw_len128 = pw_len * 128;
if (pw_len <= SNMPV3_MAX_PW_LENGTH_OPT)
{
u32 tmp[SNMPV3_TMP_ELEMS_OPT];
for (int i = 0; i < pw_len128 / 4; i++)
{
tmp[i] = tmps[gid].tmp[i];
}
for (u32 i = 0, j = loop_pos; i < loop_cnt; i += 128, j += 128)
{
const int idx = (j % pw_len128) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
u32 w4[4];
u32 w5[4];
u32 w6[4];
u32 w7[4];
w0[0] = tmp[idx + 0];
w0[1] = tmp[idx + 1];
w0[2] = tmp[idx + 2];
w0[3] = tmp[idx + 3];
w1[0] = tmp[idx + 4];
w1[1] = tmp[idx + 5];
w1[2] = tmp[idx + 6];
w1[3] = tmp[idx + 7];
w2[0] = tmp[idx + 8];
w2[1] = tmp[idx + 9];
w2[2] = tmp[idx + 10];
w2[3] = tmp[idx + 11];
w3[0] = tmp[idx + 12];
w3[1] = tmp[idx + 13];
w3[2] = tmp[idx + 14];
w3[3] = tmp[idx + 15];
w4[0] = tmp[idx + 16];
w4[1] = tmp[idx + 17];
w4[2] = tmp[idx + 18];
w4[3] = tmp[idx + 19];
w5[0] = tmp[idx + 20];
w5[1] = tmp[idx + 21];
w5[2] = tmp[idx + 22];
w5[3] = tmp[idx + 23];
w6[0] = tmp[idx + 24];
w6[1] = tmp[idx + 25];
w6[2] = tmp[idx + 26];
w6[3] = tmp[idx + 27];
w7[0] = tmp[idx + 28];
w7[1] = tmp[idx + 29];
w7[2] = tmp[idx + 30];
w7[3] = tmp[idx + 31];
sha384_transform (w0, w1, w2, w3, w4, w5, w6, w7, h);
}
}
else
{
for (u32 i = 0, j = loop_pos; i < loop_cnt; i += 128, j += 128)
{
const int idx = (j % pw_len128) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
u32 w4[4];
u32 w5[4];
u32 w6[4];
u32 w7[4];
w0[0] = tmps[gid].tmp[idx + 0];
w0[1] = tmps[gid].tmp[idx + 1];
w0[2] = tmps[gid].tmp[idx + 2];
w0[3] = tmps[gid].tmp[idx + 3];
w1[0] = tmps[gid].tmp[idx + 4];
w1[1] = tmps[gid].tmp[idx + 5];
w1[2] = tmps[gid].tmp[idx + 6];
w1[3] = tmps[gid].tmp[idx + 7];
w2[0] = tmps[gid].tmp[idx + 8];
w2[1] = tmps[gid].tmp[idx + 9];
w2[2] = tmps[gid].tmp[idx + 10];
w2[3] = tmps[gid].tmp[idx + 11];
w3[0] = tmps[gid].tmp[idx + 12];
w3[1] = tmps[gid].tmp[idx + 13];
w3[2] = tmps[gid].tmp[idx + 14];
w3[3] = tmps[gid].tmp[idx + 15];
w4[0] = tmps[gid].tmp[idx + 16];
w4[1] = tmps[gid].tmp[idx + 17];
w4[2] = tmps[gid].tmp[idx + 18];
w4[3] = tmps[gid].tmp[idx + 19];
w5[0] = tmps[gid].tmp[idx + 20];
w5[1] = tmps[gid].tmp[idx + 21];
w5[2] = tmps[gid].tmp[idx + 22];
w5[3] = tmps[gid].tmp[idx + 23];
w6[0] = tmps[gid].tmp[idx + 24];
w6[1] = tmps[gid].tmp[idx + 25];
w6[2] = tmps[gid].tmp[idx + 26];
w6[3] = tmps[gid].tmp[idx + 27];
w7[0] = tmps[gid].tmp[idx + 28];
w7[1] = tmps[gid].tmp[idx + 29];
w7[2] = tmps[gid].tmp[idx + 30];
w7[3] = tmps[gid].tmp[idx + 31];
sha384_transform (w0, w1, w2, w3, w4, w5, w6, w7, h);
}
}
tmps[gid].h[0] = h[0];
tmps[gid].h[1] = h[1];
tmps[gid].h[2] = h[2];
tmps[gid].h[3] = h[3];
tmps[gid].h[4] = h[4];
tmps[gid].h[5] = h[5];
tmps[gid].h[6] = h[6];
tmps[gid].h[7] = h[7];
}
KERNEL_FQ void m26900_comp (KERN_ATTR_TMPS_ESALT (hmac_sha384_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
u32 w4[4];
u32 w5[4];
u32 w6[4];
u32 w7[4];
w0[0] = 0x80000000;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
w4[0] = 0;
w4[1] = 0;
w4[2] = 0;
w4[3] = 0;
w5[0] = 0;
w5[1] = 0;
w5[2] = 0;
w5[3] = 0;
w6[0] = 0;
w6[1] = 0;
w6[2] = 0;
w6[3] = 0;
w7[0] = 0;
w7[1] = 0;
w7[2] = 0;
w7[3] = 1048576 * 8;
u64 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
sha384_transform (w0, w1, w2, w3, w4, w5, w6, w7, h);
sha384_ctx_t ctx;
sha384_init (&ctx);
u32 w[32];
w[ 0] = h32_from_64_S (h[0]);
w[ 1] = l32_from_64_S (h[0]);
w[ 2] = h32_from_64_S (h[1]);
w[ 3] = l32_from_64_S (h[1]);
w[ 4] = h32_from_64_S (h[2]);
w[ 5] = l32_from_64_S (h[2]);
w[ 6] = h32_from_64_S (h[3]);
w[ 7] = l32_from_64_S (h[3]);
w[ 8] = h32_from_64_S (h[4]);
w[ 9] = l32_from_64_S (h[4]);
w[10] = h32_from_64_S (h[5]);
w[11] = l32_from_64_S (h[5]);
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
w[16] = 0;
w[17] = 0;
w[18] = 0;
w[19] = 0;
w[20] = 0;
w[21] = 0;
w[22] = 0;
w[23] = 0;
w[24] = 0;
w[25] = 0;
w[26] = 0;
w[27] = 0;
w[28] = 0;
w[29] = 0;
w[30] = 0;
w[31] = 0;
sha384_update (&ctx, w, 48);
sha384_update_global_swap (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
w[ 0] = h32_from_64_S (h[0]);
w[ 1] = l32_from_64_S (h[0]);
w[ 2] = h32_from_64_S (h[1]);
w[ 3] = l32_from_64_S (h[1]);
w[ 4] = h32_from_64_S (h[2]);
w[ 5] = l32_from_64_S (h[2]);
w[ 6] = h32_from_64_S (h[3]);
w[ 7] = l32_from_64_S (h[3]);
w[ 8] = h32_from_64_S (h[4]);
w[ 9] = l32_from_64_S (h[4]);
w[10] = h32_from_64_S (h[5]);
w[11] = l32_from_64_S (h[5]);
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
w[16] = 0;
w[17] = 0;
w[18] = 0;
w[19] = 0;
w[20] = 0;
w[21] = 0;
w[22] = 0;
w[23] = 0;
w[24] = 0;
w[25] = 0;
w[26] = 0;
w[27] = 0;
w[28] = 0;
w[29] = 0;
w[30] = 0;
w[31] = 0;
sha384_update (&ctx, w, 48);
sha384_final (&ctx);
w[ 0] = h32_from_64_S (ctx.h[0]);
w[ 1] = l32_from_64_S (ctx.h[0]);
w[ 2] = h32_from_64_S (ctx.h[1]);
w[ 3] = l32_from_64_S (ctx.h[1]);
w[ 4] = h32_from_64_S (ctx.h[2]);
w[ 5] = l32_from_64_S (ctx.h[2]);
w[ 6] = h32_from_64_S (ctx.h[3]);
w[ 7] = l32_from_64_S (ctx.h[3]);
w[ 8] = h32_from_64_S (ctx.h[4]);
w[ 9] = l32_from_64_S (ctx.h[4]);
w[10] = h32_from_64_S (ctx.h[5]);
w[11] = l32_from_64_S (ctx.h[5]);
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
w[16] = 0;
w[17] = 0;
w[18] = 0;
w[19] = 0;
w[20] = 0;
w[21] = 0;
w[22] = 0;
w[23] = 0;
w[24] = 0;
w[25] = 0;
w[26] = 0;
w[27] = 0;
w[28] = 0;
w[29] = 0;
w[30] = 0;
w[31] = 0;
sha384_hmac_ctx_t hmac_ctx;
sha384_hmac_init (&hmac_ctx, w, 48);
sha384_hmac_update_global_swap (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha384_hmac_final (&hmac_ctx);
const u32 r0 = l32_from_64 (hmac_ctx.opad.h[1]);
const u32 r1 = h32_from_64 (hmac_ctx.opad.h[1]);
const u32 r2 = l32_from_64 (hmac_ctx.opad.h[0]);
const u32 r3 = h32_from_64 (hmac_ctx.opad.h[0]);
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,697 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_rp.h"
#include "inc_rp.cl"
#include "inc_scalar.cl"
#include "inc_hash_md4.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define PERM_OP(a,b,tt,n,m) \
{ \
tt = a >> n; \
tt = tt ^ b; \
tt = tt & m; \
b = b ^ tt; \
tt = tt << n; \
a = a ^ tt; \
}
#define HPERM_OP(a,tt,n,m) \
{ \
tt = a << (16 + n); \
tt = tt ^ a; \
tt = tt & m; \
a = a ^ tt; \
tt = tt >> (16 + n); \
a = a ^ tt; \
}
CONSTANT_VK u32a c_SPtrans[8][64] =
{
{
0x02080800, 0x00080000, 0x02000002, 0x02080802,
0x02000000, 0x00080802, 0x00080002, 0x02000002,
0x00080802, 0x02080800, 0x02080000, 0x00000802,
0x02000802, 0x02000000, 0x00000000, 0x00080002,
0x00080000, 0x00000002, 0x02000800, 0x00080800,
0x02080802, 0x02080000, 0x00000802, 0x02000800,
0x00000002, 0x00000800, 0x00080800, 0x02080002,
0x00000800, 0x02000802, 0x02080002, 0x00000000,
0x00000000, 0x02080802, 0x02000800, 0x00080002,
0x02080800, 0x00080000, 0x00000802, 0x02000800,
0x02080002, 0x00000800, 0x00080800, 0x02000002,
0x00080802, 0x00000002, 0x02000002, 0x02080000,
0x02080802, 0x00080800, 0x02080000, 0x02000802,
0x02000000, 0x00000802, 0x00080002, 0x00000000,
0x00080000, 0x02000000, 0x02000802, 0x02080800,
0x00000002, 0x02080002, 0x00000800, 0x00080802,
},
{
0x40108010, 0x00000000, 0x00108000, 0x40100000,
0x40000010, 0x00008010, 0x40008000, 0x00108000,
0x00008000, 0x40100010, 0x00000010, 0x40008000,
0x00100010, 0x40108000, 0x40100000, 0x00000010,
0x00100000, 0x40008010, 0x40100010, 0x00008000,
0x00108010, 0x40000000, 0x00000000, 0x00100010,
0x40008010, 0x00108010, 0x40108000, 0x40000010,
0x40000000, 0x00100000, 0x00008010, 0x40108010,
0x00100010, 0x40108000, 0x40008000, 0x00108010,
0x40108010, 0x00100010, 0x40000010, 0x00000000,
0x40000000, 0x00008010, 0x00100000, 0x40100010,
0x00008000, 0x40000000, 0x00108010, 0x40008010,
0x40108000, 0x00008000, 0x00000000, 0x40000010,
0x00000010, 0x40108010, 0x00108000, 0x40100000,
0x40100010, 0x00100000, 0x00008010, 0x40008000,
0x40008010, 0x00000010, 0x40100000, 0x00108000,
},
{
0x04000001, 0x04040100, 0x00000100, 0x04000101,
0x00040001, 0x04000000, 0x04000101, 0x00040100,
0x04000100, 0x00040000, 0x04040000, 0x00000001,
0x04040101, 0x00000101, 0x00000001, 0x04040001,
0x00000000, 0x00040001, 0x04040100, 0x00000100,
0x00000101, 0x04040101, 0x00040000, 0x04000001,
0x04040001, 0x04000100, 0x00040101, 0x04040000,
0x00040100, 0x00000000, 0x04000000, 0x00040101,
0x04040100, 0x00000100, 0x00000001, 0x00040000,
0x00000101, 0x00040001, 0x04040000, 0x04000101,
0x00000000, 0x04040100, 0x00040100, 0x04040001,
0x00040001, 0x04000000, 0x04040101, 0x00000001,
0x00040101, 0x04000001, 0x04000000, 0x04040101,
0x00040000, 0x04000100, 0x04000101, 0x00040100,
0x04000100, 0x00000000, 0x04040001, 0x00000101,
0x04000001, 0x00040101, 0x00000100, 0x04040000,
},
{
0x00401008, 0x10001000, 0x00000008, 0x10401008,
0x00000000, 0x10400000, 0x10001008, 0x00400008,
0x10401000, 0x10000008, 0x10000000, 0x00001008,
0x10000008, 0x00401008, 0x00400000, 0x10000000,
0x10400008, 0x00401000, 0x00001000, 0x00000008,
0x00401000, 0x10001008, 0x10400000, 0x00001000,
0x00001008, 0x00000000, 0x00400008, 0x10401000,
0x10001000, 0x10400008, 0x10401008, 0x00400000,
0x10400008, 0x00001008, 0x00400000, 0x10000008,
0x00401000, 0x10001000, 0x00000008, 0x10400000,
0x10001008, 0x00000000, 0x00001000, 0x00400008,
0x00000000, 0x10400008, 0x10401000, 0x00001000,
0x10000000, 0x10401008, 0x00401008, 0x00400000,
0x10401008, 0x00000008, 0x10001000, 0x00401008,
0x00400008, 0x00401000, 0x10400000, 0x10001008,
0x00001008, 0x10000000, 0x10000008, 0x10401000,
},
{
0x08000000, 0x00010000, 0x00000400, 0x08010420,
0x08010020, 0x08000400, 0x00010420, 0x08010000,
0x00010000, 0x00000020, 0x08000020, 0x00010400,
0x08000420, 0x08010020, 0x08010400, 0x00000000,
0x00010400, 0x08000000, 0x00010020, 0x00000420,
0x08000400, 0x00010420, 0x00000000, 0x08000020,
0x00000020, 0x08000420, 0x08010420, 0x00010020,
0x08010000, 0x00000400, 0x00000420, 0x08010400,
0x08010400, 0x08000420, 0x00010020, 0x08010000,
0x00010000, 0x00000020, 0x08000020, 0x08000400,
0x08000000, 0x00010400, 0x08010420, 0x00000000,
0x00010420, 0x08000000, 0x00000400, 0x00010020,
0x08000420, 0x00000400, 0x00000000, 0x08010420,
0x08010020, 0x08010400, 0x00000420, 0x00010000,
0x00010400, 0x08010020, 0x08000400, 0x00000420,
0x00000020, 0x00010420, 0x08010000, 0x08000020,
},
{
0x80000040, 0x00200040, 0x00000000, 0x80202000,
0x00200040, 0x00002000, 0x80002040, 0x00200000,
0x00002040, 0x80202040, 0x00202000, 0x80000000,
0x80002000, 0x80000040, 0x80200000, 0x00202040,
0x00200000, 0x80002040, 0x80200040, 0x00000000,
0x00002000, 0x00000040, 0x80202000, 0x80200040,
0x80202040, 0x80200000, 0x80000000, 0x00002040,
0x00000040, 0x00202000, 0x00202040, 0x80002000,
0x00002040, 0x80000000, 0x80002000, 0x00202040,
0x80202000, 0x00200040, 0x00000000, 0x80002000,
0x80000000, 0x00002000, 0x80200040, 0x00200000,
0x00200040, 0x80202040, 0x00202000, 0x00000040,
0x80202040, 0x00202000, 0x00200000, 0x80002040,
0x80000040, 0x80200000, 0x00202040, 0x00000000,
0x00002000, 0x80000040, 0x80002040, 0x80202000,
0x80200000, 0x00002040, 0x00000040, 0x80200040,
},
{
0x00004000, 0x00000200, 0x01000200, 0x01000004,
0x01004204, 0x00004004, 0x00004200, 0x00000000,
0x01000000, 0x01000204, 0x00000204, 0x01004000,
0x00000004, 0x01004200, 0x01004000, 0x00000204,
0x01000204, 0x00004000, 0x00004004, 0x01004204,
0x00000000, 0x01000200, 0x01000004, 0x00004200,
0x01004004, 0x00004204, 0x01004200, 0x00000004,
0x00004204, 0x01004004, 0x00000200, 0x01000000,
0x00004204, 0x01004000, 0x01004004, 0x00000204,
0x00004000, 0x00000200, 0x01000000, 0x01004004,
0x01000204, 0x00004204, 0x00004200, 0x00000000,
0x00000200, 0x01000004, 0x00000004, 0x01000200,
0x00000000, 0x01000204, 0x01000200, 0x00004200,
0x00000204, 0x00004000, 0x01004204, 0x01000000,
0x01004200, 0x00000004, 0x00004004, 0x01004204,
0x01000004, 0x01004200, 0x01004000, 0x00004004,
},
{
0x20800080, 0x20820000, 0x00020080, 0x00000000,
0x20020000, 0x00800080, 0x20800000, 0x20820080,
0x00000080, 0x20000000, 0x00820000, 0x00020080,
0x00820080, 0x20020080, 0x20000080, 0x20800000,
0x00020000, 0x00820080, 0x00800080, 0x20020000,
0x20820080, 0x20000080, 0x00000000, 0x00820000,
0x20000000, 0x00800000, 0x20020080, 0x20800080,
0x00800000, 0x00020000, 0x20820000, 0x00000080,
0x00800000, 0x00020000, 0x20000080, 0x20820080,
0x00020080, 0x20000000, 0x00000000, 0x00820000,
0x20800080, 0x20020080, 0x20020000, 0x00800080,
0x20820000, 0x00000080, 0x00800080, 0x20020000,
0x20820080, 0x00800000, 0x20800000, 0x20000080,
0x00820000, 0x00020080, 0x20020080, 0x20800000,
0x00000080, 0x20820000, 0x00820080, 0x00000000,
0x20000000, 0x20800080, 0x00020000, 0x00820080,
}
};
CONSTANT_VK u32a c_skb[8][64] =
{
{
0x00000000, 0x00000010, 0x20000000, 0x20000010,
0x00010000, 0x00010010, 0x20010000, 0x20010010,
0x00000800, 0x00000810, 0x20000800, 0x20000810,
0x00010800, 0x00010810, 0x20010800, 0x20010810,
0x00000020, 0x00000030, 0x20000020, 0x20000030,
0x00010020, 0x00010030, 0x20010020, 0x20010030,
0x00000820, 0x00000830, 0x20000820, 0x20000830,
0x00010820, 0x00010830, 0x20010820, 0x20010830,
0x00080000, 0x00080010, 0x20080000, 0x20080010,
0x00090000, 0x00090010, 0x20090000, 0x20090010,
0x00080800, 0x00080810, 0x20080800, 0x20080810,
0x00090800, 0x00090810, 0x20090800, 0x20090810,
0x00080020, 0x00080030, 0x20080020, 0x20080030,
0x00090020, 0x00090030, 0x20090020, 0x20090030,
0x00080820, 0x00080830, 0x20080820, 0x20080830,
0x00090820, 0x00090830, 0x20090820, 0x20090830,
},
{
0x00000000, 0x02000000, 0x00002000, 0x02002000,
0x00200000, 0x02200000, 0x00202000, 0x02202000,
0x00000004, 0x02000004, 0x00002004, 0x02002004,
0x00200004, 0x02200004, 0x00202004, 0x02202004,
0x00000400, 0x02000400, 0x00002400, 0x02002400,
0x00200400, 0x02200400, 0x00202400, 0x02202400,
0x00000404, 0x02000404, 0x00002404, 0x02002404,
0x00200404, 0x02200404, 0x00202404, 0x02202404,
0x10000000, 0x12000000, 0x10002000, 0x12002000,
0x10200000, 0x12200000, 0x10202000, 0x12202000,
0x10000004, 0x12000004, 0x10002004, 0x12002004,
0x10200004, 0x12200004, 0x10202004, 0x12202004,
0x10000400, 0x12000400, 0x10002400, 0x12002400,
0x10200400, 0x12200400, 0x10202400, 0x12202400,
0x10000404, 0x12000404, 0x10002404, 0x12002404,
0x10200404, 0x12200404, 0x10202404, 0x12202404,
},
{
0x00000000, 0x00000001, 0x00040000, 0x00040001,
0x01000000, 0x01000001, 0x01040000, 0x01040001,
0x00000002, 0x00000003, 0x00040002, 0x00040003,
0x01000002, 0x01000003, 0x01040002, 0x01040003,
0x00000200, 0x00000201, 0x00040200, 0x00040201,
0x01000200, 0x01000201, 0x01040200, 0x01040201,
0x00000202, 0x00000203, 0x00040202, 0x00040203,
0x01000202, 0x01000203, 0x01040202, 0x01040203,
0x08000000, 0x08000001, 0x08040000, 0x08040001,
0x09000000, 0x09000001, 0x09040000, 0x09040001,
0x08000002, 0x08000003, 0x08040002, 0x08040003,
0x09000002, 0x09000003, 0x09040002, 0x09040003,
0x08000200, 0x08000201, 0x08040200, 0x08040201,
0x09000200, 0x09000201, 0x09040200, 0x09040201,
0x08000202, 0x08000203, 0x08040202, 0x08040203,
0x09000202, 0x09000203, 0x09040202, 0x09040203,
},
{
0x00000000, 0x00100000, 0x00000100, 0x00100100,
0x00000008, 0x00100008, 0x00000108, 0x00100108,
0x00001000, 0x00101000, 0x00001100, 0x00101100,
0x00001008, 0x00101008, 0x00001108, 0x00101108,
0x04000000, 0x04100000, 0x04000100, 0x04100100,
0x04000008, 0x04100008, 0x04000108, 0x04100108,
0x04001000, 0x04101000, 0x04001100, 0x04101100,
0x04001008, 0x04101008, 0x04001108, 0x04101108,
0x00020000, 0x00120000, 0x00020100, 0x00120100,
0x00020008, 0x00120008, 0x00020108, 0x00120108,
0x00021000, 0x00121000, 0x00021100, 0x00121100,
0x00021008, 0x00121008, 0x00021108, 0x00121108,
0x04020000, 0x04120000, 0x04020100, 0x04120100,
0x04020008, 0x04120008, 0x04020108, 0x04120108,
0x04021000, 0x04121000, 0x04021100, 0x04121100,
0x04021008, 0x04121008, 0x04021108, 0x04121108,
},
{
0x00000000, 0x10000000, 0x00010000, 0x10010000,
0x00000004, 0x10000004, 0x00010004, 0x10010004,
0x20000000, 0x30000000, 0x20010000, 0x30010000,
0x20000004, 0x30000004, 0x20010004, 0x30010004,
0x00100000, 0x10100000, 0x00110000, 0x10110000,
0x00100004, 0x10100004, 0x00110004, 0x10110004,
0x20100000, 0x30100000, 0x20110000, 0x30110000,
0x20100004, 0x30100004, 0x20110004, 0x30110004,
0x00001000, 0x10001000, 0x00011000, 0x10011000,
0x00001004, 0x10001004, 0x00011004, 0x10011004,
0x20001000, 0x30001000, 0x20011000, 0x30011000,
0x20001004, 0x30001004, 0x20011004, 0x30011004,
0x00101000, 0x10101000, 0x00111000, 0x10111000,
0x00101004, 0x10101004, 0x00111004, 0x10111004,
0x20101000, 0x30101000, 0x20111000, 0x30111000,
0x20101004, 0x30101004, 0x20111004, 0x30111004,
},
{
0x00000000, 0x08000000, 0x00000008, 0x08000008,
0x00000400, 0x08000400, 0x00000408, 0x08000408,
0x00020000, 0x08020000, 0x00020008, 0x08020008,
0x00020400, 0x08020400, 0x00020408, 0x08020408,
0x00000001, 0x08000001, 0x00000009, 0x08000009,
0x00000401, 0x08000401, 0x00000409, 0x08000409,
0x00020001, 0x08020001, 0x00020009, 0x08020009,
0x00020401, 0x08020401, 0x00020409, 0x08020409,
0x02000000, 0x0A000000, 0x02000008, 0x0A000008,
0x02000400, 0x0A000400, 0x02000408, 0x0A000408,
0x02020000, 0x0A020000, 0x02020008, 0x0A020008,
0x02020400, 0x0A020400, 0x02020408, 0x0A020408,
0x02000001, 0x0A000001, 0x02000009, 0x0A000009,
0x02000401, 0x0A000401, 0x02000409, 0x0A000409,
0x02020001, 0x0A020001, 0x02020009, 0x0A020009,
0x02020401, 0x0A020401, 0x02020409, 0x0A020409,
},
{
0x00000000, 0x00000100, 0x00080000, 0x00080100,
0x01000000, 0x01000100, 0x01080000, 0x01080100,
0x00000010, 0x00000110, 0x00080010, 0x00080110,
0x01000010, 0x01000110, 0x01080010, 0x01080110,
0x00200000, 0x00200100, 0x00280000, 0x00280100,
0x01200000, 0x01200100, 0x01280000, 0x01280100,
0x00200010, 0x00200110, 0x00280010, 0x00280110,
0x01200010, 0x01200110, 0x01280010, 0x01280110,
0x00000200, 0x00000300, 0x00080200, 0x00080300,
0x01000200, 0x01000300, 0x01080200, 0x01080300,
0x00000210, 0x00000310, 0x00080210, 0x00080310,
0x01000210, 0x01000310, 0x01080210, 0x01080310,
0x00200200, 0x00200300, 0x00280200, 0x00280300,
0x01200200, 0x01200300, 0x01280200, 0x01280300,
0x00200210, 0x00200310, 0x00280210, 0x00280310,
0x01200210, 0x01200310, 0x01280210, 0x01280310,
},
{
0x00000000, 0x04000000, 0x00040000, 0x04040000,
0x00000002, 0x04000002, 0x00040002, 0x04040002,
0x00002000, 0x04002000, 0x00042000, 0x04042000,
0x00002002, 0x04002002, 0x00042002, 0x04042002,
0x00000020, 0x04000020, 0x00040020, 0x04040020,
0x00000022, 0x04000022, 0x00040022, 0x04040022,
0x00002020, 0x04002020, 0x00042020, 0x04042020,
0x00002022, 0x04002022, 0x00042022, 0x04042022,
0x00000800, 0x04000800, 0x00040800, 0x04040800,
0x00000802, 0x04000802, 0x00040802, 0x04040802,
0x00002800, 0x04002800, 0x00042800, 0x04042800,
0x00002802, 0x04002802, 0x00042802, 0x04042802,
0x00000820, 0x04000820, 0x00040820, 0x04040820,
0x00000822, 0x04000822, 0x00040822, 0x04040822,
0x00002820, 0x04002820, 0x00042820, 0x04042820,
0x00002822, 0x04002822, 0x00042822, 0x04042822
}
};
#if VECT_SIZE == 1
#define BOX(i,n,S) (S)[(n)][(i)]
#elif VECT_SIZE == 2
#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
#elif VECT_SIZE == 4
#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
#elif VECT_SIZE == 8
#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
#elif VECT_SIZE == 16
#define BOX(i,n,S) make_u32x ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
#endif
DECLSPEC void _des_crypt_encrypt (u32 *iv, u32 *data, u32 *Kc, u32 *Kd, SHM_TYPE u32 (*s_SPtrans)[64])
{
u32 r = data[0];
u32 l = data[1];
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < 16; i += 2)
{
u32 u;
u32 t;
u = Kc[i + 0] ^ hc_rotl32 (r, 30u);
t = Kd[i + 0] ^ hc_rotl32 (r, 26u);
l ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans)
| BOX (((u >> 8) & 0x3f), 2, s_SPtrans)
| BOX (((u >> 16) & 0x3f), 4, s_SPtrans)
| BOX (((u >> 24) & 0x3f), 6, s_SPtrans)
| BOX (((t >> 0) & 0x3f), 1, s_SPtrans)
| BOX (((t >> 8) & 0x3f), 3, s_SPtrans)
| BOX (((t >> 16) & 0x3f), 5, s_SPtrans)
| BOX (((t >> 24) & 0x3f), 7, s_SPtrans);
u = Kc[i + 1] ^ hc_rotl32 (l, 30u);
t = Kd[i + 1] ^ hc_rotl32 (l, 26u);
r ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans)
| BOX (((u >> 8) & 0x3f), 2, s_SPtrans)
| BOX (((u >> 16) & 0x3f), 4, s_SPtrans)
| BOX (((u >> 24) & 0x3f), 6, s_SPtrans)
| BOX (((t >> 0) & 0x3f), 1, s_SPtrans)
| BOX (((t >> 8) & 0x3f), 3, s_SPtrans)
| BOX (((t >> 16) & 0x3f), 5, s_SPtrans)
| BOX (((t >> 24) & 0x3f), 7, s_SPtrans);
}
iv[0] = l;
iv[1] = r;
}
DECLSPEC void _des_crypt_keysetup (u32 c, u32 d, u32 *Kc, u32 *Kd, SHM_TYPE u32 (*s_skb)[64])
{
u32 tt;
PERM_OP (d, c, tt, 4, 0x0f0f0f0f);
HPERM_OP (c, tt, 2, 0xcccc0000);
HPERM_OP (d, tt, 2, 0xcccc0000);
PERM_OP (d, c, tt, 1, 0x55555555);
PERM_OP (c, d, tt, 8, 0x00ff00ff);
PERM_OP (d, c, tt, 1, 0x55555555);
d = ((d & 0x000000ff) << 16)
| ((d & 0x0000ff00) << 0)
| ((d & 0x00ff0000) >> 16)
| ((c & 0xf0000000) >> 4);
c = c & 0x0fffffff;
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < 16; i++)
{
if ((i < 2) || (i == 8) || (i == 15))
{
c = ((c >> 1) | (c << 27));
d = ((d >> 1) | (d << 27));
}
else
{
c = ((c >> 2) | (c << 26));
d = ((d >> 2) | (d << 26));
}
c = c & 0x0fffffff;
d = d & 0x0fffffff;
const u32 c00 = (c >> 0) & 0x0000003f;
const u32 c06 = (c >> 6) & 0x00383003;
const u32 c07 = (c >> 7) & 0x0000003c;
const u32 c13 = (c >> 13) & 0x0000060f;
const u32 c20 = (c >> 20) & 0x00000001;
u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb)
| BOX (((c06 >> 0) & 0xff)
|((c07 >> 0) & 0xff), 1, s_skb)
| BOX (((c13 >> 0) & 0xff)
|((c06 >> 8) & 0xff), 2, s_skb)
| BOX (((c20 >> 0) & 0xff)
|((c13 >> 8) & 0xff)
|((c06 >> 16) & 0xff), 3, s_skb);
const u32 d00 = (d >> 0) & 0x00003c3f;
const u32 d07 = (d >> 7) & 0x00003f03;
const u32 d21 = (d >> 21) & 0x0000000f;
const u32 d22 = (d >> 22) & 0x00000030;
u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb)
| BOX (((d07 >> 0) & 0xff)
|((d00 >> 8) & 0xff), 5, s_skb)
| BOX (((d07 >> 8) & 0xff), 6, s_skb)
| BOX (((d21 >> 0) & 0xff)
|((d22 >> 0) & 0xff), 7, s_skb);
Kc[i] = ((t << 16) | (s & 0x0000ffff));
Kd[i] = ((s >> 16) | (t & 0xffff0000));
}
}
DECLSPEC void transform_netntlmv1_key (const u32 w0, const u32 w1, u32 *out)
{
u32 t[8];
t[0] = (w0 >> 0) & 0xff;
t[1] = (w0 >> 8) & 0xff;
t[2] = (w0 >> 16) & 0xff;
t[3] = (w0 >> 24) & 0xff;
t[4] = (w1 >> 0) & 0xff;
t[5] = (w1 >> 8) & 0xff;
t[6] = (w1 >> 16) & 0xff;
t[7] = (w1 >> 24) & 0xff;
u32 k[8];
k[0] = (t[0] >> 0);
k[1] = (t[0] << 7) | (t[1] >> 1);
k[2] = (t[1] << 6) | (t[2] >> 2);
k[3] = (t[2] << 5) | (t[3] >> 3);
k[4] = (t[3] << 4) | (t[4] >> 4);
k[5] = (t[4] << 3) | (t[5] >> 5);
k[6] = (t[5] << 2) | (t[6] >> 6);
k[7] = (t[6] << 1);
out[0] = ((k[0] & 0xff) << 0)
| ((k[1] & 0xff) << 8)
| ((k[2] & 0xff) << 16)
| ((k[3] & 0xff) << 24);
out[1] = ((k[4] & 0xff) << 0)
| ((k[5] & 0xff) << 8)
| ((k[6] & 0xff) << 16)
| ((k[7] & 0xff) << 24);
}
#ifdef KERNEL_STATIC
DECLSPEC u8 hex_convert (const u8 c)
{
return (c & 15) + (c >> 6) * 9;
}
DECLSPEC u8 hex_to_u8 (const u8 *hex)
{
u8 v = 0;
v |= ((u8) hex_convert (hex[1]) << 0);
v |= ((u8) hex_convert (hex[0]) << 4);
return (v);
}
#endif
typedef struct netntlm
{
u32 user_len;
u32 domain_len;
u32 srvchall_len;
u32 clichall_len;
u32 userdomain_buf[64];
u32 chall_buf[256];
} netntlm_t;
typedef struct netntlm_tmp
{
u32 digest_buf[4];
} netntlm_tmp_t;
KERNEL_FQ void m27000_init (KERN_ATTR_TMPS_ESALT (netntlm_tmp_t, netntlm_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
const u64 lsz = get_local_size (0);
if (gid >= gid_max) return;
/**
* salt
*/
u32 in[16];
in[ 0] = pws[gid].i[ 0];
in[ 1] = pws[gid].i[ 1];
in[ 2] = pws[gid].i[ 2];
in[ 3] = pws[gid].i[ 3];
in[ 4] = pws[gid].i[ 4];
in[ 5] = pws[gid].i[ 5];
in[ 6] = pws[gid].i[ 6];
in[ 7] = pws[gid].i[ 7];
u8 *in_ptr = (u8 *) in;
u32 out[4];
u8 *out_ptr = (u8 *) out;
for (int i = 0, j = 0; i < 16; i += 1, j += 2)
{
out_ptr[i] = hex_to_u8 (in_ptr + j);
}
tmps[gid].digest_buf[0] = out[ 0];
tmps[gid].digest_buf[1] = out[ 1];
tmps[gid].digest_buf[2] = out[ 2];
tmps[gid].digest_buf[3] = out[ 3];
}
KERNEL_FQ void m27000_loop (KERN_ATTR_TMPS_ESALT (netntlm_tmp_t, netntlm_t))
{
}
KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (netntlm_tmp_t, netntlm_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
const u64 lsz = get_local_size (0);
/**
* sbox, kbox
*/
#ifdef REAL_SHM
LOCAL_VK u32 s_SPtrans[8][64];
LOCAL_VK u32 s_skb[8][64];
for (u32 i = lid; i < 64; i += lsz)
{
s_SPtrans[0][i] = c_SPtrans[0][i];
s_SPtrans[1][i] = c_SPtrans[1][i];
s_SPtrans[2][i] = c_SPtrans[2][i];
s_SPtrans[3][i] = c_SPtrans[3][i];
s_SPtrans[4][i] = c_SPtrans[4][i];
s_SPtrans[5][i] = c_SPtrans[5][i];
s_SPtrans[6][i] = c_SPtrans[6][i];
s_SPtrans[7][i] = c_SPtrans[7][i];
s_skb[0][i] = c_skb[0][i];
s_skb[1][i] = c_skb[1][i];
s_skb[2][i] = c_skb[2][i];
s_skb[3][i] = c_skb[3][i];
s_skb[4][i] = c_skb[4][i];
s_skb[5][i] = c_skb[5][i];
s_skb[6][i] = c_skb[6][i];
s_skb[7][i] = c_skb[7][i];
}
SYNC_THREADS ();
#else
CONSTANT_AS u32a (*s_SPtrans)[64] = c_SPtrans;
CONSTANT_AS u32a (*s_skb)[64] = c_skb;
#endif
if (gid >= gid_max) return;
/**
* base
*/
const u32 s0 = salt_bufs[SALT_POS].salt_buf[0];
const u32 s1 = salt_bufs[SALT_POS].salt_buf[1];
const u32 s2 = salt_bufs[SALT_POS].salt_buf[2];
const u32 a = tmps[gid].digest_buf[0];
const u32 b = tmps[gid].digest_buf[1];
const u32 c = tmps[gid].digest_buf[2];
const u32 d = tmps[gid].digest_buf[3];
// I believe this matches the last 2 bytes and throws away.
// Taken from 5500.
if ((d >> 16) != s2) return;
/**
* DES1
*/
u32 key[2];
transform_netntlmv1_key (a, b, key);
u32 Kc[16];
u32 Kd[16];
_des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);
u32 data[2];
data[0] = s0;
data[1] = s1;
u32 out1[2];
_des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans);
/**
* DES2
*/
transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key);
_des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);
u32 out2[2];
_des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans);
/**
* digest
*/
const u32 r0 = out1[0];
const u32 r1 = out1[1];
const u32 r2 = out2[0];
const u32 r3 = out2[1];
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,197 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
// #define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_rp.h"
#include "inc_rp.cl"
#include "inc_scalar.cl"
#include "inc_hash_md4.cl"
#include "inc_hash_md5.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#ifdef KERNEL_STATIC
DECLSPEC u8 hex_convert (const u8 c)
{
return (c & 15) + (c >> 6) * 9;
}
DECLSPEC u8 hex_to_u8 (const u8 *hex)
{
u8 v = 0;
v |= ((u8) hex_convert (hex[1]) << 0);
v |= ((u8) hex_convert (hex[0]) << 4);
return (v);
}
#endif
typedef struct netntlm
{
u32 user_len;
u32 domain_len;
u32 srvchall_len;
u32 clichall_len;
u32 userdomain_buf[64];
u32 chall_buf[256];
} netntlm_t;
typedef struct netntlmv2_tmp
{
u32 digest_buf[4];
} netntlm_tmp_t;
KERNEL_FQ void m27100_init (KERN_ATTR_TMPS_ESALT (netntlm_tmp_t, netntlm_t))
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
u32 in[16];
in[ 0] = pws[gid].i[ 0];
in[ 1] = pws[gid].i[ 1];
in[ 2] = pws[gid].i[ 2];
in[ 3] = pws[gid].i[ 3];
in[ 4] = pws[gid].i[ 4];
in[ 5] = pws[gid].i[ 5];
in[ 6] = pws[gid].i[ 6];
in[ 7] = pws[gid].i[ 7];
u8 *in_ptr = (u8 *) in;
u32 out[4];
u8 *out_ptr = (u8 *) out;
for (int i = 0, j = 0; i < 16; i += 1, j += 2)
{
out_ptr[i] = hex_to_u8 (in_ptr + j);
}
tmps[gid].digest_buf[0] = out[ 0];
tmps[gid].digest_buf[1] = out[ 1];
tmps[gid].digest_buf[2] = out[ 2];
tmps[gid].digest_buf[3] = out[ 3];
}
KERNEL_FQ void m27100_loop (KERN_ATTR_TMPS_ESALT (netntlm_tmp_t, netntlm_t))
{
}
KERNEL_FQ void m27100_comp (KERN_ATTR_TMPS_ESALT (netntlm_tmp_t, netntlm_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
const u64 lid = get_local_id (0);
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmps[gid].digest_buf[0];
w0[1] = tmps[gid].digest_buf[1];
w0[2] = tmps[gid].digest_buf[2];
w0[3] = tmps[gid].digest_buf[3];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
md5_hmac_ctx_t ctx0;
md5_hmac_init_64 (&ctx0, w0, w1, w2, w3);
md5_hmac_update_global (&ctx0, esalt_bufs[DIGESTS_OFFSET].userdomain_buf, esalt_bufs[DIGESTS_OFFSET].user_len + esalt_bufs[DIGESTS_OFFSET].domain_len);
md5_hmac_final (&ctx0);
w0[0] = ctx0.opad.h[0];
w0[1] = ctx0.opad.h[1];
w0[2] = ctx0.opad.h[2];
w0[3] = ctx0.opad.h[3];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
md5_hmac_ctx_t ctx;
md5_hmac_init_64 (&ctx, w0, w1, w2, w3);
md5_hmac_update_global (&ctx, esalt_bufs[DIGESTS_OFFSET].chall_buf, esalt_bufs[DIGESTS_OFFSET].srvchall_len + esalt_bufs[DIGESTS_OFFSET].clichall_len);
md5_hmac_final (&ctx);
tmps[gid].digest_buf[0] = ctx.opad.h[0];
tmps[gid].digest_buf[1] = ctx.opad.h[1];
tmps[gid].digest_buf[2] = ctx.opad.h[2];
tmps[gid].digest_buf[3] = ctx.opad.h[3];
/**
* digest
*/
const u32 r0 = ctx.opad.h[DGST_R0];
const u32 r1 = ctx.opad.h[DGST_R1];
const u32 r2 = ctx.opad.h[DGST_R2];
const u32 r3 = ctx.opad.h[DGST_R3];
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,513 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_rp_optimized.h"
#include "inc_rp_optimized.cl"
#include "inc_simd.cl"
#include "inc_hash_sha1.cl"
#endif
KERNEL_FQ void m27200_m04 (KERN_ATTR_RULES ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 pw_buf0[4];
u32 pw_buf1[4];
pw_buf0[0] = pws[gid].i[0];
pw_buf0[1] = pws[gid].i[1];
pw_buf0[2] = pws[gid].i[2];
pw_buf0[3] = pws[gid].i[3];
pw_buf1[0] = pws[gid].i[4];
pw_buf1[1] = pws[gid].i[5];
pw_buf1[2] = pws[gid].i[6];
pw_buf1[3] = pws[gid].i[7];
const u32 pw_len = pws[gid].pw_len & 63;
/**
* salt
*/
u32 salt_buf0[4] = { 0x2d2d, 0, 0, 0 };
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4] = { 0 };
salt_buf0[0] |= salt_bufs[SALT_POS].salt_buf[ 0] << 16;
salt_buf0[1] = salt_bufs[SALT_POS].salt_buf[ 0] >> 16 | salt_bufs[SALT_POS].salt_buf[ 1] << 16;
salt_buf0[2] = salt_bufs[SALT_POS].salt_buf[ 1] >> 16 | salt_bufs[SALT_POS].salt_buf[ 2] << 16;
salt_buf0[3] = salt_bufs[SALT_POS].salt_buf[ 2] >> 16 | salt_bufs[SALT_POS].salt_buf[ 3] << 16;
salt_buf1[0] = salt_bufs[SALT_POS].salt_buf[ 3] >> 16 | salt_bufs[SALT_POS].salt_buf[ 4] << 16;
salt_buf1[1] = salt_bufs[SALT_POS].salt_buf[ 4] >> 16 | salt_bufs[SALT_POS].salt_buf[ 5] << 16;
salt_buf1[2] = salt_bufs[SALT_POS].salt_buf[ 5] >> 16 | salt_bufs[SALT_POS].salt_buf[ 6] << 16;
salt_buf1[3] = salt_bufs[SALT_POS].salt_buf[ 6] >> 16 | salt_bufs[SALT_POS].salt_buf[ 7] << 16;
salt_buf2[0] = salt_bufs[SALT_POS].salt_buf[ 7] >> 16 | salt_bufs[SALT_POS].salt_buf[ 8] << 16;
salt_buf2[1] = salt_bufs[SALT_POS].salt_buf[ 8] >> 16 | salt_bufs[SALT_POS].salt_buf[ 9] << 16;
salt_buf2[2] = salt_bufs[SALT_POS].salt_buf[ 9] >> 16 | 0x2d2d0000;
salt_buf2[3] = 0;
const u32 salt_len = 44; //salt_bufs[SALT_POS].salt_len;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
u32x w0[4] = { 0 };
u32x w1[4] = { 0 };
u32x w2[4] = { 0 };
u32x w3[4] = { 0 };
const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);
/**
* prepend salt
*/
const u32x out_salt_len = out_len + salt_len;
switch_buffer_by_offset_le_VV (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
w0[2] |= salt_buf0[2];
w0[3] |= salt_buf0[3];
w1[0] |= salt_buf1[0];
w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3];
w2[0] |= salt_buf2[0];
w2[1] |= salt_buf2[1];
w2[2] |= salt_buf2[2];
w2[3] |= salt_buf2[3];
w3[0] |= salt_buf3[0];
w3[1] |= salt_buf3[1];
w3[2] |= salt_buf3[2];
w3[3] |= salt_buf3[3];
append_0x2d_4x4_VV (w0, w1, w2, w3, out_salt_len);
append_0x2d_4x4_VV (w0, w1, w2, w3, out_salt_len + 1);
append_0x80_4x4_VV (w0, w1, w2, w3, out_salt_len + 2);
/**
* sha1
*/
u32x w0_t = hc_swap32 (w0[0]);
u32x w1_t = hc_swap32 (w0[1]);
u32x w2_t = hc_swap32 (w0[2]);
u32x w3_t = hc_swap32 (w0[3]);
u32x w4_t = hc_swap32 (w1[0]);
u32x w5_t = hc_swap32 (w1[1]);
u32x w6_t = hc_swap32 (w1[2]);
u32x w7_t = hc_swap32 (w1[3]);
u32x w8_t = hc_swap32 (w2[0]);
u32x w9_t = hc_swap32 (w2[1]);
u32x wa_t = hc_swap32 (w2[2]);
u32x wb_t = hc_swap32 (w2[3]);
u32x wc_t = hc_swap32 (w3[0]);
u32x wd_t = hc_swap32 (w3[1]);
u32x we_t = 0;
u32x wf_t = (out_salt_len + 2) * 8;
u32x a = SHA1M_A;
u32x b = SHA1M_B;
u32x c = SHA1M_C;
u32x d = SHA1M_D;
u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
#undef K
#define K SHA1C01
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
#undef K
#define K SHA1C02
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
#undef K
#define K SHA1C03
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_M_SIMD (d, e, c, b);
}
}
KERNEL_FQ void m27200_m08 (KERN_ATTR_RULES ())
{
}
KERNEL_FQ void m27200_m16 (KERN_ATTR_RULES ())
{
}
KERNEL_FQ void m27200_s04 (KERN_ATTR_RULES ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 pw_buf0[4];
u32 pw_buf1[4];
pw_buf0[0] = pws[gid].i[0];
pw_buf0[1] = pws[gid].i[1];
pw_buf0[2] = pws[gid].i[2];
pw_buf0[3] = pws[gid].i[3];
pw_buf1[0] = pws[gid].i[4];
pw_buf1[1] = pws[gid].i[5];
pw_buf1[2] = pws[gid].i[6];
pw_buf1[3] = pws[gid].i[7];
const u32 pw_len = pws[gid].pw_len & 63;
/**
* salt
*/
u32 salt_buf0[4] = { 0x2d2d, 0, 0, 0 };
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4] = { 0 };
salt_buf0[0] |= salt_bufs[SALT_POS].salt_buf[ 0] << 16;
salt_buf0[1] = salt_bufs[SALT_POS].salt_buf[ 0] >> 16 | salt_bufs[SALT_POS].salt_buf[ 1] << 16;
salt_buf0[2] = salt_bufs[SALT_POS].salt_buf[ 1] >> 16 | salt_bufs[SALT_POS].salt_buf[ 2] << 16;
salt_buf0[3] = salt_bufs[SALT_POS].salt_buf[ 2] >> 16 | salt_bufs[SALT_POS].salt_buf[ 3] << 16;
salt_buf1[0] = salt_bufs[SALT_POS].salt_buf[ 3] >> 16 | salt_bufs[SALT_POS].salt_buf[ 4] << 16;
salt_buf1[1] = salt_bufs[SALT_POS].salt_buf[ 4] >> 16 | salt_bufs[SALT_POS].salt_buf[ 5] << 16;
salt_buf1[2] = salt_bufs[SALT_POS].salt_buf[ 5] >> 16 | salt_bufs[SALT_POS].salt_buf[ 6] << 16;
salt_buf1[3] = salt_bufs[SALT_POS].salt_buf[ 6] >> 16 | salt_bufs[SALT_POS].salt_buf[ 7] << 16;
salt_buf2[0] = salt_bufs[SALT_POS].salt_buf[ 7] >> 16 | salt_bufs[SALT_POS].salt_buf[ 8] << 16;
salt_buf2[1] = salt_bufs[SALT_POS].salt_buf[ 8] >> 16 | salt_bufs[SALT_POS].salt_buf[ 9] << 16;
salt_buf2[2] = salt_bufs[SALT_POS].salt_buf[ 9] >> 16 | 0x2d2d0000;
salt_buf2[3] = 0;
const u32 salt_len = 44;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* reverse
*/
const u32 e_rev = hc_rotl32_S (search[1], 2u);
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
u32x w0[4] = { 0 };
u32x w1[4] = { 0 };
u32x w2[4] = { 0 };
u32x w3[4] = { 0 };
const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);
/**
* prepend salt
*/
const u32x out_salt_len = out_len + salt_len;
switch_buffer_by_offset_le_VV (w0, w1, w2, w3, salt_len);
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
w0[2] |= salt_buf0[2];
w0[3] |= salt_buf0[3];
w1[0] |= salt_buf1[0];
w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3];
w2[0] |= salt_buf2[0];
w2[1] |= salt_buf2[1];
w2[2] |= salt_buf2[2];
w2[3] |= salt_buf2[3];
w3[0] |= salt_buf3[0];
w3[1] |= salt_buf3[1];
w3[2] |= salt_buf3[2];
w3[3] |= salt_buf3[3];
append_0x2d_4x4_VV (w0, w1, w2, w3, out_salt_len);
append_0x2d_4x4_VV (w0, w1, w2, w3, out_salt_len + 1);
append_0x80_4x4_VV (w0, w1, w2, w3, out_salt_len + 2);
/**
* sha1
*/
u32x w0_t = hc_swap32 (w0[0]);
u32x w1_t = hc_swap32 (w0[1]);
u32x w2_t = hc_swap32 (w0[2]);
u32x w3_t = hc_swap32 (w0[3]);
u32x w4_t = hc_swap32 (w1[0]);
u32x w5_t = hc_swap32 (w1[1]);
u32x w6_t = hc_swap32 (w1[2]);
u32x w7_t = hc_swap32 (w1[3]);
u32x w8_t = hc_swap32 (w2[0]);
u32x w9_t = hc_swap32 (w2[1]);
u32x wa_t = hc_swap32 (w2[2]);
u32x wb_t = hc_swap32 (w2[3]);
u32x wc_t = hc_swap32 (w3[0]);
u32x wd_t = hc_swap32 (w3[1]);
u32x we_t = 0;
u32x wf_t = (out_salt_len + 2) * 8;
u32x a = SHA1M_A;
u32x b = SHA1M_B;
u32x c = SHA1M_C;
u32x d = SHA1M_D;
u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
#undef K
#define K SHA1C01
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
#undef K
#define K SHA1C02
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
#undef K
#define K SHA1C03
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_S_SIMD (d, e, c, b);
}
}
KERNEL_FQ void m27200_s08 (KERN_ATTR_RULES ())
{
}
KERNEL_FQ void m27200_s16 (KERN_ATTR_RULES ())
{
}

@ -0,0 +1,142 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_rp.h"
#include "inc_rp.cl"
#include "inc_scalar.cl"
#include "inc_hash_sha1.cl"
#endif
KERNEL_FQ void m27200_mxx (KERN_ATTR_RULES ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
const u32 dash[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/**
* base
*/
COPY_PW (pws[gid]);
sha1_ctx_t ctx0;
sha1_init (&ctx0);
ctx0.w0[0] = dash[0];
ctx0.w0[1] = dash[1];
ctx0.len = 2;
sha1_update_global_swap (&ctx0, salt_bufs[SALT_POS].salt_buf, salt_bufs[SALT_POS].salt_len);
sha1_update (&ctx0, dash, 2);
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
sha1_ctx_t ctx = ctx0;
sha1_update_swap (&ctx, tmp.i, tmp.pw_len);
sha1_update (&ctx, dash, 2);
sha1_final (&ctx);
const u32 r0 = ctx.h[DGST_R0];
const u32 r1 = ctx.h[DGST_R1];
const u32 r2 = ctx.h[DGST_R2];
const u32 r3 = ctx.h[DGST_R3];
COMPARE_M_SCALAR (r0, r1, r2, r3);
}
}
KERNEL_FQ void m27200_sxx (KERN_ATTR_RULES ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
const u32 dash[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* base
*/
COPY_PW (pws[gid]);
sha1_ctx_t ctx0;
sha1_init (&ctx0);
ctx0.w0[0] = dash[0];
ctx0.w0[1] = dash[1];
ctx0.len = 2;
sha1_update_global_swap (&ctx0, salt_bufs[SALT_POS].salt_buf, salt_bufs[SALT_POS].salt_len);
sha1_update (&ctx0, dash, 2);
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
sha1_ctx_t ctx = ctx0;
sha1_update_swap (&ctx, tmp.i, tmp.pw_len);
sha1_update (&ctx, dash, 2);
sha1_final (&ctx);
const u32 r0 = ctx.h[DGST_R0];
const u32 r1 = ctx.h[DGST_R1];
const u32 r2 = ctx.h[DGST_R2];
const u32 r3 = ctx.h[DGST_R3];
COMPARE_S_SCALAR (r0, r1, r2, r3);
}
}

@ -0,0 +1,630 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha1.cl"
#endif
KERNEL_FQ void m27200_m04 (KERN_ATTR_BASIC ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 pw_buf0[4];
u32 pw_buf1[4];
pw_buf0[0] = pws[gid].i[0];
pw_buf0[1] = pws[gid].i[1];
pw_buf0[2] = pws[gid].i[2];
pw_buf0[3] = pws[gid].i[3];
pw_buf1[0] = pws[gid].i[4];
pw_buf1[1] = pws[gid].i[5];
pw_buf1[2] = pws[gid].i[6];
pw_buf1[3] = pws[gid].i[7];
const u32 pw_l_len = pws[gid].pw_len & 63;
/**
* salt
*/
u32 salt_buf0[4] = { 0x2d2d, 0, 0, 0 };
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4] = { 0 };
salt_buf0[0] |= salt_bufs[SALT_POS].salt_buf[ 0] << 16;
salt_buf0[1] = salt_bufs[SALT_POS].salt_buf[ 0] >> 16 | salt_bufs[SALT_POS].salt_buf[ 1] << 16;
salt_buf0[2] = salt_bufs[SALT_POS].salt_buf[ 1] >> 16 | salt_bufs[SALT_POS].salt_buf[ 2] << 16;
salt_buf0[3] = salt_bufs[SALT_POS].salt_buf[ 2] >> 16 | salt_bufs[SALT_POS].salt_buf[ 3] << 16;
salt_buf1[0] = salt_bufs[SALT_POS].salt_buf[ 3] >> 16 | salt_bufs[SALT_POS].salt_buf[ 4] << 16;
salt_buf1[1] = salt_bufs[SALT_POS].salt_buf[ 4] >> 16 | salt_bufs[SALT_POS].salt_buf[ 5] << 16;
salt_buf1[2] = salt_bufs[SALT_POS].salt_buf[ 5] >> 16 | salt_bufs[SALT_POS].salt_buf[ 6] << 16;
salt_buf1[3] = salt_bufs[SALT_POS].salt_buf[ 6] >> 16 | salt_bufs[SALT_POS].salt_buf[ 7] << 16;
salt_buf2[0] = salt_bufs[SALT_POS].salt_buf[ 7] >> 16 | salt_bufs[SALT_POS].salt_buf[ 8] << 16;
salt_buf2[1] = salt_bufs[SALT_POS].salt_buf[ 8] >> 16 | salt_bufs[SALT_POS].salt_buf[ 9] << 16;
salt_buf2[2] = salt_bufs[SALT_POS].salt_buf[ 9] >> 16 | 0x2d2d0000;
salt_buf2[3] = 0;
const u32 salt_len = 44;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
const u32x pw_len = (pw_l_len + pw_r_len) & 63;
/**
* concat password candidate
*/
u32x wordl0[4] = { 0 };
u32x wordl1[4] = { 0 };
u32x wordl2[4] = { 0 };
u32x wordl3[4] = { 0 };
wordl0[0] = pw_buf0[0];
wordl0[1] = pw_buf0[1];
wordl0[2] = pw_buf0[2];
wordl0[3] = pw_buf0[3];
wordl1[0] = pw_buf1[0];
wordl1[1] = pw_buf1[1];
wordl1[2] = pw_buf1[2];
wordl1[3] = pw_buf1[3];
u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
else
{
switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
}
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3];
w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3];
w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3];
w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3];
/**
* prepend salt
*/
switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
const u32x pw_salt_len = pw_len + salt_len;
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
w0[2] |= salt_buf0[2];
w0[3] |= salt_buf0[3];
w1[0] |= salt_buf1[0];
w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3];
w2[0] |= salt_buf2[0];
w2[1] |= salt_buf2[1];
w2[2] |= salt_buf2[2];
w2[3] |= salt_buf2[3];
w3[0] |= salt_buf3[0];
w3[1] |= salt_buf3[1];
w3[2] |= salt_buf3[2];
w3[3] |= salt_buf3[3];
append_0x2d_4x4_VV (w0, w1, w2, w3, pw_salt_len);
append_0x2d_4x4_VV (w0, w1, w2, w3, pw_salt_len + 1);
append_0x80_4x4_VV (w0, w1, w2, w3, pw_salt_len + 2);
/**
* sha1
*/
u32x w0_t = hc_swap32 (w0[0]);
u32x w1_t = hc_swap32 (w0[1]);
u32x w2_t = hc_swap32 (w0[2]);
u32x w3_t = hc_swap32 (w0[3]);
u32x w4_t = hc_swap32 (w1[0]);
u32x w5_t = hc_swap32 (w1[1]);
u32x w6_t = hc_swap32 (w1[2]);
u32x w7_t = hc_swap32 (w1[3]);
u32x w8_t = hc_swap32 (w2[0]);
u32x w9_t = hc_swap32 (w2[1]);
u32x wa_t = hc_swap32 (w2[2]);
u32x wb_t = hc_swap32 (w2[3]);
u32x wc_t = hc_swap32 (w3[0]);
u32x wd_t = hc_swap32 (w3[1]);
u32x we_t = 0;
u32x wf_t = (pw_salt_len + 2) * 8;
u32x a = SHA1M_A;
u32x b = SHA1M_B;
u32x c = SHA1M_C;
u32x d = SHA1M_D;
u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
#undef K
#define K SHA1C01
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
#undef K
#define K SHA1C02
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
#undef K
#define K SHA1C03
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_M_SIMD (d, e, c, b);
}
}
KERNEL_FQ void m27200_m08 (KERN_ATTR_BASIC ())
{
}
KERNEL_FQ void m27200_m16 (KERN_ATTR_BASIC ())
{
}
KERNEL_FQ void m27200_s04 (KERN_ATTR_BASIC ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 pw_buf0[4];
u32 pw_buf1[4];
pw_buf0[0] = pws[gid].i[0];
pw_buf0[1] = pws[gid].i[1];
pw_buf0[2] = pws[gid].i[2];
pw_buf0[3] = pws[gid].i[3];
pw_buf1[0] = pws[gid].i[4];
pw_buf1[1] = pws[gid].i[5];
pw_buf1[2] = pws[gid].i[6];
pw_buf1[3] = pws[gid].i[7];
const u32 pw_l_len = pws[gid].pw_len & 63;
/**
* salt
*/
u32 salt_buf0[4] = { 0x2d2d, 0, 0, 0 };
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4] = { 0 };
salt_buf0[0] |= salt_bufs[SALT_POS].salt_buf[ 0] << 16;
salt_buf0[1] = salt_bufs[SALT_POS].salt_buf[ 0] >> 16 | salt_bufs[SALT_POS].salt_buf[ 1] << 16;
salt_buf0[2] = salt_bufs[SALT_POS].salt_buf[ 1] >> 16 | salt_bufs[SALT_POS].salt_buf[ 2] << 16;
salt_buf0[3] = salt_bufs[SALT_POS].salt_buf[ 2] >> 16 | salt_bufs[SALT_POS].salt_buf[ 3] << 16;
salt_buf1[0] = salt_bufs[SALT_POS].salt_buf[ 3] >> 16 | salt_bufs[SALT_POS].salt_buf[ 4] << 16;
salt_buf1[1] = salt_bufs[SALT_POS].salt_buf[ 4] >> 16 | salt_bufs[SALT_POS].salt_buf[ 5] << 16;
salt_buf1[2] = salt_bufs[SALT_POS].salt_buf[ 5] >> 16 | salt_bufs[SALT_POS].salt_buf[ 6] << 16;
salt_buf1[3] = salt_bufs[SALT_POS].salt_buf[ 6] >> 16 | salt_bufs[SALT_POS].salt_buf[ 7] << 16;
salt_buf2[0] = salt_bufs[SALT_POS].salt_buf[ 7] >> 16 | salt_bufs[SALT_POS].salt_buf[ 8] << 16;
salt_buf2[1] = salt_bufs[SALT_POS].salt_buf[ 8] >> 16 | salt_bufs[SALT_POS].salt_buf[ 9] << 16;
salt_buf2[2] = salt_bufs[SALT_POS].salt_buf[ 9] >> 16 | 0x2d2d0000;
salt_buf2[3] = 0;
const u32 salt_len = 44;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* reverse
*/
const u32 e_rev = hc_rotl32_S (search[1], 2u);
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
const u32x pw_len = (pw_l_len + pw_r_len) & 63;
/**
* concat password candidate
*/
u32x wordl0[4] = { 0 };
u32x wordl1[4] = { 0 };
u32x wordl2[4] = { 0 };
u32x wordl3[4] = { 0 };
wordl0[0] = pw_buf0[0];
wordl0[1] = pw_buf0[1];
wordl0[2] = pw_buf0[2];
wordl0[3] = pw_buf0[3];
wordl1[0] = pw_buf1[0];
wordl1[1] = pw_buf1[1];
wordl1[2] = pw_buf1[2];
wordl1[3] = pw_buf1[3];
u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
else
{
switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
}
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3];
w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3];
w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3];
w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3];
/**
* prepend salt
*/
switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
const u32x pw_salt_len = pw_len + salt_len;
w0[0] |= salt_buf0[0];
w0[1] |= salt_buf0[1];
w0[2] |= salt_buf0[2];
w0[3] |= salt_buf0[3];
w1[0] |= salt_buf1[0];
w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3];
w2[0] |= salt_buf2[0];
w2[1] |= salt_buf2[1];
w2[2] |= salt_buf2[2];
w2[3] |= salt_buf2[3];
w3[0] |= salt_buf3[0];
w3[1] |= salt_buf3[1];
w3[2] |= salt_buf3[2];
w3[3] |= salt_buf3[3];
append_0x2d_4x4_VV (w0, w1, w2, w3, pw_salt_len);
append_0x2d_4x4_VV (w0, w1, w2, w3, pw_salt_len + 1);
append_0x80_4x4_VV (w0, w1, w2, w3, pw_salt_len + 2);
/**
* sha1
*/
u32x w0_t = hc_swap32 (w0[0]);
u32x w1_t = hc_swap32 (w0[1]);
u32x w2_t = hc_swap32 (w0[2]);
u32x w3_t = hc_swap32 (w0[3]);
u32x w4_t = hc_swap32 (w1[0]);
u32x w5_t = hc_swap32 (w1[1]);
u32x w6_t = hc_swap32 (w1[2]);
u32x w7_t = hc_swap32 (w1[3]);
u32x w8_t = hc_swap32 (w2[0]);
u32x w9_t = hc_swap32 (w2[1]);
u32x wa_t = hc_swap32 (w2[2]);
u32x wb_t = hc_swap32 (w2[3]);
u32x wc_t = hc_swap32 (w3[0]);
u32x wd_t = hc_swap32 (w3[1]);
u32x we_t = 0;
u32x wf_t = (pw_salt_len + 2) * 8;
u32x a = SHA1M_A;
u32x b = SHA1M_B;
u32x c = SHA1M_C;
u32x d = SHA1M_D;
u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
#undef K
#define K SHA1C01
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
#undef K
#define K SHA1C02
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
#undef K
#define K SHA1C03
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_S_SIMD (d, e, c, b);
}
}
KERNEL_FQ void m27200_s08 (KERN_ATTR_BASIC ())
{
}
KERNEL_FQ void m27200_s16 (KERN_ATTR_BASIC ())
{
}

@ -0,0 +1,131 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_scalar.cl"
#include "inc_hash_sha1.cl"
#endif
KERNEL_FQ void m27200_mxx (KERN_ATTR_BASIC ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
const u32 dash[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/**
* base
*/
sha1_ctx_t ctx0;
sha1_init (&ctx0);
ctx0.w0[0] = dash[0];
ctx0.w0[1] = dash[1];
ctx0.len = 2;
sha1_update_global_swap (&ctx0, salt_bufs[SALT_POS].salt_buf, salt_bufs[SALT_POS].salt_len);
sha1_update (&ctx0, dash, 2);
sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len);
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
{
sha1_ctx_t ctx = ctx0;
sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
sha1_update (&ctx, dash, 2);
sha1_final (&ctx);
const u32 r0 = ctx.h[DGST_R0];
const u32 r1 = ctx.h[DGST_R1];
const u32 r2 = ctx.h[DGST_R2];
const u32 r3 = ctx.h[DGST_R3];
COMPARE_M_SCALAR (r0, r1, r2, r3);
}
}
KERNEL_FQ void m27200_sxx (KERN_ATTR_BASIC ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
const u32 dash[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* base
*/
sha1_ctx_t ctx0;
sha1_init (&ctx0);
ctx0.w0[0] = dash[0];
ctx0.w0[1] = dash[1];
ctx0.len = 2;
sha1_update_global_swap (&ctx0, salt_bufs[SALT_POS].salt_buf, salt_bufs[SALT_POS].salt_len);
sha1_update (&ctx0, dash, 2);
sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len);
/**
* loop
*/
for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
{
sha1_ctx_t ctx = ctx0;
sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
sha1_update (&ctx, dash, 2);
sha1_final (&ctx);
const u32 r0 = ctx.h[DGST_R0];
const u32 r1 = ctx.h[DGST_R1];
const u32 r2 = ctx.h[DGST_R2];
const u32 r3 = ctx.h[DGST_R3];
COMPARE_S_SCALAR (r0, r1, r2, r3);
}
}

@ -0,0 +1,828 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha1.cl"
#endif
DECLSPEC void append_4 (const u32 offset, u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 src_r0)
{
u32 tmp[2];
switch (offset & 3)
{
case 0: tmp[0] = src_r0;
tmp[1] = 0;
break;
case 1: tmp[0] = src_r0 >> 8;
tmp[1] = src_r0 << 24;
break;
case 2: tmp[0] = src_r0 >> 16;
tmp[1] = src_r0 << 16;
break;
case 3: tmp[0] = src_r0 >> 24;
tmp[1] = src_r0 << 8;
break;
}
switch (offset / 4)
{
case 0: w0[0] |= tmp[0];
w0[1] = tmp[1];
break;
case 1: w0[1] |= tmp[0];
w0[2] = tmp[1];
break;
case 2: w0[2] |= tmp[0];
w0[3] = tmp[1];
break;
case 3: w0[3] |= tmp[0];
w1[0] = tmp[1];
break;
case 4: w1[0] |= tmp[0];
w1[1] = tmp[1];
break;
case 5: w1[1] |= tmp[0];
w1[2] = tmp[1];
break;
case 6: w1[2] |= tmp[0];
w1[3] = tmp[1];
break;
case 7: w1[3] |= tmp[0];
w2[0] = tmp[1];
break;
case 8: w2[0] |= tmp[0];
w2[1] = tmp[1];
break;
case 9: w2[1] |= tmp[0];
w2[2] = tmp[1];
break;
case 10: w2[2] |= tmp[0];
w2[3] = tmp[1];
break;
case 11: w2[3] |= tmp[0];
w3[0] = tmp[1];
break;
case 12: w3[0] |= tmp[0];
w3[1] = tmp[1];
break;
case 13: w3[1] |= tmp[0];
w3[2] = tmp[1];
break;
case 14: w3[2] |= tmp[0];
w3[3] = tmp[1];
break;
case 15: w3[3] |= tmp[0];
break;
}
}
DECLSPEC void m27200m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC ())
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
const u32 dash_stop = 0x2d2d8000;
/**
* salt
*/
u32 salt_buf0[4] = { 0x2d2d0000, 0, 0, 0 };
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4] = { 0 };
salt_buf0[0] |= hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 0] << 16);
salt_buf0[1] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 0] >> 16 | salt_bufs[SALT_POS].salt_buf[ 1] << 16);
salt_buf0[2] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 1] >> 16 | salt_bufs[SALT_POS].salt_buf[ 2] << 16);
salt_buf0[3] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 2] >> 16 | salt_bufs[SALT_POS].salt_buf[ 3] << 16);
salt_buf1[0] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 3] >> 16 | salt_bufs[SALT_POS].salt_buf[ 4] << 16);
salt_buf1[1] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 4] >> 16 | salt_bufs[SALT_POS].salt_buf[ 5] << 16);
salt_buf1[2] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 5] >> 16 | salt_bufs[SALT_POS].salt_buf[ 6] << 16);
salt_buf1[3] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 6] >> 16 | salt_bufs[SALT_POS].salt_buf[ 7] << 16);
salt_buf2[0] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 7] >> 16 | salt_bufs[SALT_POS].salt_buf[ 8] << 16);
salt_buf2[1] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 8] >> 16 | salt_bufs[SALT_POS].salt_buf[ 9] << 16);
salt_buf2[2] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 9] >> 16 | 0x2d2d0000);
salt_buf2[3] = 0;
const u32 salt_len = 44;
append_4 (pw_len, w0, w1, w2, w3, dash_stop);
const u32 pw_salt_len = pw_len + 2 + salt_len;
/**
* loop
*/
const u32 w0l = w0[0];
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
u32x t0[4];
u32x t1[4];
u32x t2[4];
u32x t3[4];
t0[0] = w0lr;
t0[1] = w0[1];
t0[2] = w0[2];
t0[3] = w0[3];
t1[0] = w1[0];
t1[1] = w1[1];
t1[2] = w1[2];
t1[3] = w1[3];
t2[0] = w2[0];
t2[1] = w2[1];
t2[2] = w2[2];
t2[3] = w2[3];
t3[0] = w3[0];
t3[1] = w3[1];
t3[2] = w3[2];
t3[3] = w3[3];
switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len);
t0[0] |= salt_buf0[0];
t0[1] |= salt_buf0[1];
t0[2] |= salt_buf0[2];
t0[3] |= salt_buf0[3];
t1[0] |= salt_buf1[0];
t1[1] |= salt_buf1[1];
t1[2] |= salt_buf1[2];
t1[3] |= salt_buf1[3];
t2[0] |= salt_buf2[0];
t2[1] |= salt_buf2[1];
t2[2] |= salt_buf2[2];
t2[3] |= salt_buf2[3];
t3[0] |= salt_buf3[0];
t3[1] |= salt_buf3[1];
t3[2] = 0;
t3[3] = (pw_salt_len) * 8;
/**
* sha1
*/
u32x w0_t = t0[0];
u32x w1_t = t0[1];
u32x w2_t = t0[2];
u32x w3_t = t0[3];
u32x w4_t = t1[0];
u32x w5_t = t1[1];
u32x w6_t = t1[2];
u32x w7_t = t1[3];
u32x w8_t = t2[0];
u32x w9_t = t2[1];
u32x wa_t = t2[2];
u32x wb_t = t2[3];
u32x wc_t = t3[0];
u32x wd_t = t3[1];
u32x we_t = 0;
u32x wf_t = (pw_salt_len) * 8;
u32x a = SHA1M_A;
u32x b = SHA1M_B;
u32x c = SHA1M_C;
u32x d = SHA1M_D;
u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
#undef K
#define K SHA1C01
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
#undef K
#define K SHA1C02
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
#undef K
#define K SHA1C03
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_M_SIMD (d, e, c, b);
}
}
DECLSPEC void m27200s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KERN_ATTR_BASIC ())
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
const u32 dash_stop = 0x2d2d8000;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* reverse
*/
const u32 e_rev = hc_rotl32_S (search[1], 2u);
/**
* salt
*/
u32 salt_buf0[4] = { 0x2d2d0000, 0, 0, 0 };
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4] = { 0 };
salt_buf0[0] |= hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 0] << 16);
salt_buf0[1] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 0] >> 16 | salt_bufs[SALT_POS].salt_buf[ 1] << 16);
salt_buf0[2] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 1] >> 16 | salt_bufs[SALT_POS].salt_buf[ 2] << 16);
salt_buf0[3] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 2] >> 16 | salt_bufs[SALT_POS].salt_buf[ 3] << 16);
salt_buf1[0] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 3] >> 16 | salt_bufs[SALT_POS].salt_buf[ 4] << 16);
salt_buf1[1] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 4] >> 16 | salt_bufs[SALT_POS].salt_buf[ 5] << 16);
salt_buf1[2] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 5] >> 16 | salt_bufs[SALT_POS].salt_buf[ 6] << 16);
salt_buf1[3] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 6] >> 16 | salt_bufs[SALT_POS].salt_buf[ 7] << 16);
salt_buf2[0] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 7] >> 16 | salt_bufs[SALT_POS].salt_buf[ 8] << 16);
salt_buf2[1] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 8] >> 16 | salt_bufs[SALT_POS].salt_buf[ 9] << 16);
salt_buf2[2] = hc_swap32_S (salt_bufs[SALT_POS].salt_buf[ 9] >> 16 | 0x2d2d0000);
salt_buf2[3] = 0;
const u32 salt_len = 44;
append_4 (pw_len, w0, w1, w2, w3, dash_stop);
const u32 pw_salt_len = pw_len + 2 + salt_len;
/**
* loop
*/
const u32 w0l = w0[0];
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
u32x t0[4];
u32x t1[4];
u32x t2[4];
u32x t3[4];
t0[0] = w0lr;
t0[1] = w0[1];
t0[2] = w0[2];
t0[3] = w0[3];
t1[0] = w1[0];
t1[1] = w1[1];
t1[2] = w1[2];
t1[3] = w1[3];
t2[0] = w2[0];
t2[1] = w2[1];
t2[2] = w2[2];
t2[3] = w2[3];
t3[0] = w3[0];
t3[1] = w3[1];
t3[2] = w3[2];
t3[3] = w3[3];
switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len);
t0[0] |= salt_buf0[0];
t0[1] |= salt_buf0[1];
t0[2] |= salt_buf0[2];
t0[3] |= salt_buf0[3];
t1[0] |= salt_buf1[0];
t1[1] |= salt_buf1[1];
t1[2] |= salt_buf1[2];
t1[3] |= salt_buf1[3];
t2[0] |= salt_buf2[0];
t2[1] |= salt_buf2[1];
t2[2] |= salt_buf2[2];
t2[3] |= salt_buf2[3];
t3[0] |= salt_buf3[0];
t3[1] |= salt_buf3[1];
t3[2] = 0;
t3[3] = pw_salt_len * 8;
/**
* sha1
*/
u32x w0_t = t0[0];
u32x w1_t = t0[1];
u32x w2_t = t0[2];
u32x w3_t = t0[3];
u32x w4_t = t1[0];
u32x w5_t = t1[1];
u32x w6_t = t1[2];
u32x w7_t = t1[3];
u32x w8_t = t2[0];
u32x w9_t = t2[1];
u32x wa_t = t2[2];
u32x wb_t = t2[3];
u32x wc_t = t3[0];
u32x wd_t = t3[1];
u32x we_t = 0;
u32x wf_t = pw_salt_len * 8;
u32x a = SHA1M_A;
u32x b = SHA1M_B;
u32x c = SHA1M_C;
u32x d = SHA1M_D;
u32x e = SHA1M_E;
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t);
SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t);
SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t);
SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t);
SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t);
SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t);
#undef K
#define K SHA1C01
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t);
#undef K
#define K SHA1C02
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t);
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t);
#undef K
#define K SHA1C03
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t);
w0_t = hc_rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t);
w1_t = hc_rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t);
w2_t = hc_rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t);
w3_t = hc_rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t);
w4_t = hc_rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t);
w5_t = hc_rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t);
w6_t = hc_rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t);
w7_t = hc_rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t);
w8_t = hc_rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t);
w9_t = hc_rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t);
wa_t = hc_rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = hc_rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = hc_rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = hc_rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = hc_rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = hc_rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_S_SIMD (d, e, c, b);
}
}
KERNEL_FQ void m27200_m04 (KERN_ATTR_BASIC ())
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
w0[0] = pws[gid].i[ 0];
w0[1] = pws[gid].i[ 1];
w0[2] = pws[gid].i[ 2];
w0[3] = pws[gid].i[ 3];
u32 w1[4];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
u32 w2[4];
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
u32 w3[4];
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
const u32 pw_len = pws[gid].pw_len & 63;
/**
* main
*/
m27200m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, SALT_POS, loop_pos, loop_cnt, il_cnt, digests_cnt, DIGESTS_OFFSET, combs_mode, salt_repeat, pws_pos, gid_max);
}
KERNEL_FQ void m27200_m08 (KERN_ATTR_BASIC ())
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
w0[0] = pws[gid].i[ 0];
w0[1] = pws[gid].i[ 1];
w0[2] = pws[gid].i[ 2];
w0[3] = pws[gid].i[ 3];
u32 w1[4];
w1[0] = pws[gid].i[ 4];
w1[1] = pws[gid].i[ 5];
w1[2] = pws[gid].i[ 6];
w1[3] = pws[gid].i[ 7];
u32 w2[4];
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
u32 w3[4];
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
const u32 pw_len = pws[gid].pw_len & 63;
/**
* main
*/
m27200m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, SALT_POS, loop_pos, loop_cnt, il_cnt, digests_cnt, DIGESTS_OFFSET, combs_mode, salt_repeat, pws_pos, gid_max);
}
KERNEL_FQ void m27200_m16 (KERN_ATTR_BASIC ())
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
w0[0] = pws[gid].i[ 0];
w0[1] = pws[gid].i[ 1];
w0[2] = pws[gid].i[ 2];
w0[3] = pws[gid].i[ 3];
u32 w1[4];
w1[0] = pws[gid].i[ 4];
w1[1] = pws[gid].i[ 5];
w1[2] = pws[gid].i[ 6];
w1[3] = pws[gid].i[ 7];
u32 w2[4];
w2[0] = pws[gid].i[ 8];
w2[1] = pws[gid].i[ 9];
w2[2] = pws[gid].i[10];
w2[3] = pws[gid].i[11];
u32 w3[4];
w3[0] = pws[gid].i[12];
w3[1] = pws[gid].i[13];
w3[2] = 0;
w3[3] = 0;
const u32 pw_len = pws[gid].pw_len & 63;
/**
* main
*/
m27200m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, SALT_POS, loop_pos, loop_cnt, il_cnt, digests_cnt, DIGESTS_OFFSET, combs_mode, salt_repeat, pws_pos, gid_max);
}
KERNEL_FQ void m27200_s04 (KERN_ATTR_BASIC ())
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
w0[0] = pws[gid].i[ 0];
w0[1] = pws[gid].i[ 1];
w0[2] = pws[gid].i[ 2];
w0[3] = pws[gid].i[ 3];
u32 w1[4];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
u32 w2[4];
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
u32 w3[4];
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
const u32 pw_len = pws[gid].pw_len & 63;
/**
* main
*/
m27200s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, SALT_POS, loop_pos, loop_cnt, il_cnt, digests_cnt, DIGESTS_OFFSET, combs_mode, salt_repeat, pws_pos, gid_max);
}
KERNEL_FQ void m27200_s08 (KERN_ATTR_BASIC ())
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
w0[0] = pws[gid].i[ 0];
w0[1] = pws[gid].i[ 1];
w0[2] = pws[gid].i[ 2];
w0[3] = pws[gid].i[ 3];
u32 w1[4];
w1[0] = pws[gid].i[ 4];
w1[1] = pws[gid].i[ 5];
w1[2] = pws[gid].i[ 6];
w1[3] = pws[gid].i[ 7];
u32 w2[4];
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
u32 w3[4];
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
const u32 pw_len = pws[gid].pw_len & 63;
/**
* main
*/
m27200s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, SALT_POS, loop_pos, loop_cnt, il_cnt, digests_cnt, DIGESTS_OFFSET, combs_mode, salt_repeat, pws_pos, gid_max);
}
KERNEL_FQ void m27200_s16 (KERN_ATTR_BASIC ())
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
w0[0] = pws[gid].i[ 0];
w0[1] = pws[gid].i[ 1];
w0[2] = pws[gid].i[ 2];
w0[3] = pws[gid].i[ 3];
u32 w1[4];
w1[0] = pws[gid].i[ 4];
w1[1] = pws[gid].i[ 5];
w1[2] = pws[gid].i[ 6];
w1[3] = pws[gid].i[ 7];
u32 w2[4];
w2[0] = pws[gid].i[ 8];
w2[1] = pws[gid].i[ 9];
w2[2] = pws[gid].i[10];
w2[3] = pws[gid].i[11];
u32 w3[4];
w3[0] = pws[gid].i[12];
w3[1] = pws[gid].i[13];
w3[2] = 0;
w3[3] = 0;
const u32 pw_len = pws[gid].pw_len & 63;
/**
* main
*/
m27200s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, SALT_POS, loop_pos, loop_cnt, il_cnt, digests_cnt, DIGESTS_OFFSET, combs_mode, salt_repeat, pws_pos, gid_max);
}

@ -0,0 +1,169 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha1.cl"
#endif
KERNEL_FQ void m27200_mxx (KERN_ATTR_VECTOR ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
const u32x dash_vector[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
const u32 dash_scalar[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32x w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
sha1_ctx_t ctx0;
sha1_init (&ctx0);
ctx0.w0[0] = dash_scalar[0];
ctx0.w0[1] = dash_scalar[1];
ctx0.len = 2;
sha1_update_global_swap (&ctx0, salt_bufs[SALT_POS].salt_buf, salt_bufs[SALT_POS].salt_len);
sha1_update (&ctx0, dash_scalar, 2);
/**
* loop
*/
u32x w0l = w[0];
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
sha1_ctx_vector_t ctx;
sha1_init_vector_from_scalar (&ctx, &ctx0);
sha1_update_vector (&ctx, w, pw_len);
sha1_update_vector (&ctx, dash_vector, 2);
sha1_final_vector (&ctx);
const u32x r0 = ctx.h[DGST_R0];
const u32x r1 = ctx.h[DGST_R1];
const u32x r2 = ctx.h[DGST_R2];
const u32x r3 = ctx.h[DGST_R3];
COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
KERNEL_FQ void m27200_sxx (KERN_ATTR_VECTOR ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
const u32x dash_vector[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
const u32 dash_scalar[16] = { 0x2d2d0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
};
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32x w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
sha1_ctx_t ctx0;
sha1_init (&ctx0);
ctx0.w0[0] = dash_scalar[0];
ctx0.w0[1] = dash_scalar[1];
ctx0.len = 2;
sha1_update_global_swap (&ctx0, salt_bufs[SALT_POS].salt_buf, salt_bufs[SALT_POS].salt_len);
sha1_update (&ctx0, dash_scalar, 2);
/**
* loop
*/
u32x w0l = w[0];
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
sha1_ctx_vector_t ctx;
sha1_init_vector_from_scalar (&ctx, &ctx0);
sha1_update_vector (&ctx, w, pw_len);
sha1_update_vector (&ctx, dash_vector, 2);
sha1_final_vector (&ctx);
const u32x r0 = ctx.h[DGST_R0];
const u32x r1 = ctx.h[DGST_R1];
const u32x r2 = ctx.h[DGST_R2];
const u32x r3 = ctx.h[DGST_R3];
COMPARE_S_SIMD (r0, r1, r2, r3);
}
}

@ -0,0 +1,495 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha512.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 34
#define SNMPV3_MSG_AUTH_PARAMS_MAX 48
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 128
#define SNMPV3_TMP_ELEMS 8192 // 8192 = (256 (max pw length) * SNMPV3_MAX_PW_LENGTH) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 8
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of SNMPV3_MAX_PW_LENGTH
#define SNMPV3_MAX_ENGINE_ELEMS 32 // 32 * 4 = 128 > 34, also has to be multiple of SNMPV3_MAX_PW_LENGTH
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
#define SNMPV3_MAX_PW_LENGTH_OPT 32
#define SNMPV3_TMP_ELEMS_OPT ((SNMPV3_MAX_PW_LENGTH_OPT * SNMPV3_MAX_PW_LENGTH) / 4)
// (32 * 128) / 4 = 1024
// for pw length > 32 we use global memory reads
typedef struct hmac_sha512_tmp
{
u32 tmp[SNMPV3_TMP_ELEMS];
u64 h[SNMPV3_HASH_ELEMS];
} hmac_sha512_tmp_t;
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
KERNEL_FQ void m27300_init (KERN_ATTR_TMPS_ESALT (hmac_sha512_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32 w[128] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u8 *src_ptr = (u8 *) w;
// password 128 times, also swapped
u32 dst_buf[32];
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 128; i++)
{
for (u32 j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 127;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 byte are written into cache
if (dst_idx == 127)
{
const int tmp_idx4 = (tmp_idx - 127) / 4;
tmps[gid].tmp[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]);
tmps[gid].tmp[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]);
tmps[gid].tmp[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]);
tmps[gid].tmp[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]);
tmps[gid].tmp[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]);
tmps[gid].tmp[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]);
tmps[gid].tmp[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]);
tmps[gid].tmp[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]);
tmps[gid].tmp[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]);
tmps[gid].tmp[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]);
tmps[gid].tmp[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]);
tmps[gid].tmp[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]);
tmps[gid].tmp[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]);
tmps[gid].tmp[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]);
tmps[gid].tmp[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]);
tmps[gid].tmp[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]);
tmps[gid].tmp[tmp_idx4 + 16] = hc_swap32_S (dst_buf[16]);
tmps[gid].tmp[tmp_idx4 + 17] = hc_swap32_S (dst_buf[17]);
tmps[gid].tmp[tmp_idx4 + 18] = hc_swap32_S (dst_buf[18]);
tmps[gid].tmp[tmp_idx4 + 19] = hc_swap32_S (dst_buf[19]);
tmps[gid].tmp[tmp_idx4 + 20] = hc_swap32_S (dst_buf[20]);
tmps[gid].tmp[tmp_idx4 + 21] = hc_swap32_S (dst_buf[21]);
tmps[gid].tmp[tmp_idx4 + 22] = hc_swap32_S (dst_buf[22]);
tmps[gid].tmp[tmp_idx4 + 23] = hc_swap32_S (dst_buf[23]);
tmps[gid].tmp[tmp_idx4 + 24] = hc_swap32_S (dst_buf[24]);
tmps[gid].tmp[tmp_idx4 + 25] = hc_swap32_S (dst_buf[25]);
tmps[gid].tmp[tmp_idx4 + 26] = hc_swap32_S (dst_buf[26]);
tmps[gid].tmp[tmp_idx4 + 27] = hc_swap32_S (dst_buf[27]);
tmps[gid].tmp[tmp_idx4 + 28] = hc_swap32_S (dst_buf[28]);
tmps[gid].tmp[tmp_idx4 + 29] = hc_swap32_S (dst_buf[29]);
tmps[gid].tmp[tmp_idx4 + 30] = hc_swap32_S (dst_buf[30]);
tmps[gid].tmp[tmp_idx4 + 31] = hc_swap32_S (dst_buf[31]);
}
tmp_idx++;
}
}
// hash
tmps[gid].h[0] = SHA512M_A;
tmps[gid].h[1] = SHA512M_B;
tmps[gid].h[2] = SHA512M_C;
tmps[gid].h[3] = SHA512M_D;
tmps[gid].h[4] = SHA512M_E;
tmps[gid].h[5] = SHA512M_F;
tmps[gid].h[6] = SHA512M_G;
tmps[gid].h[7] = SHA512M_H;
}
KERNEL_FQ void m27300_loop (KERN_ATTR_TMPS_ESALT (hmac_sha512_tmp_t, snmpv3_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u64 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
const u32 pw_len = pws[gid].pw_len;
const int pw_len128 = pw_len * 128;
if (pw_len <= SNMPV3_MAX_PW_LENGTH_OPT)
{
u32 tmp[SNMPV3_TMP_ELEMS_OPT];
for (int i = 0; i < pw_len128 / 4; i++)
{
tmp[i] = tmps[gid].tmp[i];
}
for (u32 i = 0, j = loop_pos; i < loop_cnt; i += 128, j += 128)
{
const int idx = (j % pw_len128) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
u32 w4[4];
u32 w5[4];
u32 w6[4];
u32 w7[4];
w0[0] = tmp[idx + 0];
w0[1] = tmp[idx + 1];
w0[2] = tmp[idx + 2];
w0[3] = tmp[idx + 3];
w1[0] = tmp[idx + 4];
w1[1] = tmp[idx + 5];
w1[2] = tmp[idx + 6];
w1[3] = tmp[idx + 7];
w2[0] = tmp[idx + 8];
w2[1] = tmp[idx + 9];
w2[2] = tmp[idx + 10];
w2[3] = tmp[idx + 11];
w3[0] = tmp[idx + 12];
w3[1] = tmp[idx + 13];
w3[2] = tmp[idx + 14];
w3[3] = tmp[idx + 15];
w4[0] = tmp[idx + 16];
w4[1] = tmp[idx + 17];
w4[2] = tmp[idx + 18];
w4[3] = tmp[idx + 19];
w5[0] = tmp[idx + 20];
w5[1] = tmp[idx + 21];
w5[2] = tmp[idx + 22];
w5[3] = tmp[idx + 23];
w6[0] = tmp[idx + 24];
w6[1] = tmp[idx + 25];
w6[2] = tmp[idx + 26];
w6[3] = tmp[idx + 27];
w7[0] = tmp[idx + 28];
w7[1] = tmp[idx + 29];
w7[2] = tmp[idx + 30];
w7[3] = tmp[idx + 31];
sha512_transform (w0, w1, w2, w3, w4, w5, w6, w7, h);
}
}
else
{
for (u32 i = 0, j = loop_pos; i < loop_cnt; i += 128, j += 128)
{
const int idx = (j % pw_len128) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
u32 w4[4];
u32 w5[4];
u32 w6[4];
u32 w7[4];
w0[0] = tmps[gid].tmp[idx + 0];
w0[1] = tmps[gid].tmp[idx + 1];
w0[2] = tmps[gid].tmp[idx + 2];
w0[3] = tmps[gid].tmp[idx + 3];
w1[0] = tmps[gid].tmp[idx + 4];
w1[1] = tmps[gid].tmp[idx + 5];
w1[2] = tmps[gid].tmp[idx + 6];
w1[3] = tmps[gid].tmp[idx + 7];
w2[0] = tmps[gid].tmp[idx + 8];
w2[1] = tmps[gid].tmp[idx + 9];
w2[2] = tmps[gid].tmp[idx + 10];
w2[3] = tmps[gid].tmp[idx + 11];
w3[0] = tmps[gid].tmp[idx + 12];
w3[1] = tmps[gid].tmp[idx + 13];
w3[2] = tmps[gid].tmp[idx + 14];
w3[3] = tmps[gid].tmp[idx + 15];
w4[0] = tmps[gid].tmp[idx + 16];
w4[1] = tmps[gid].tmp[idx + 17];
w4[2] = tmps[gid].tmp[idx + 18];
w4[3] = tmps[gid].tmp[idx + 19];
w5[0] = tmps[gid].tmp[idx + 20];
w5[1] = tmps[gid].tmp[idx + 21];
w5[2] = tmps[gid].tmp[idx + 22];
w5[3] = tmps[gid].tmp[idx + 23];
w6[0] = tmps[gid].tmp[idx + 24];
w6[1] = tmps[gid].tmp[idx + 25];
w6[2] = tmps[gid].tmp[idx + 26];
w6[3] = tmps[gid].tmp[idx + 27];
w7[0] = tmps[gid].tmp[idx + 28];
w7[1] = tmps[gid].tmp[idx + 29];
w7[2] = tmps[gid].tmp[idx + 30];
w7[3] = tmps[gid].tmp[idx + 31];
sha512_transform (w0, w1, w2, w3, w4, w5, w6, w7, h);
}
}
tmps[gid].h[0] = h[0];
tmps[gid].h[1] = h[1];
tmps[gid].h[2] = h[2];
tmps[gid].h[3] = h[3];
tmps[gid].h[4] = h[4];
tmps[gid].h[5] = h[5];
tmps[gid].h[6] = h[6];
tmps[gid].h[7] = h[7];
}
KERNEL_FQ void m27300_comp (KERN_ATTR_TMPS_ESALT (hmac_sha512_tmp_t, snmpv3_t))
{
/**
* modifier
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
u32 w4[4];
u32 w5[4];
u32 w6[4];
u32 w7[4];
w0[0] = 0x80000000;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
w4[0] = 0;
w4[1] = 0;
w4[2] = 0;
w4[3] = 0;
w5[0] = 0;
w5[1] = 0;
w5[2] = 0;
w5[3] = 0;
w6[0] = 0;
w6[1] = 0;
w6[2] = 0;
w6[3] = 0;
w7[0] = 0;
w7[1] = 0;
w7[2] = 0;
w7[3] = 1048576 * 8;
u64 h[8];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
h[5] = tmps[gid].h[5];
h[6] = tmps[gid].h[6];
h[7] = tmps[gid].h[7];
sha512_transform (w0, w1, w2, w3, w4, w5, w6, w7, h);
sha512_ctx_t ctx;
sha512_init (&ctx);
u32 w[32];
w[ 0] = h32_from_64_S (h[0]);
w[ 1] = l32_from_64_S (h[0]);
w[ 2] = h32_from_64_S (h[1]);
w[ 3] = l32_from_64_S (h[1]);
w[ 4] = h32_from_64_S (h[2]);
w[ 5] = l32_from_64_S (h[2]);
w[ 6] = h32_from_64_S (h[3]);
w[ 7] = l32_from_64_S (h[3]);
w[ 8] = h32_from_64_S (h[4]);
w[ 9] = l32_from_64_S (h[4]);
w[10] = h32_from_64_S (h[5]);
w[11] = l32_from_64_S (h[5]);
w[12] = h32_from_64_S (h[6]);
w[13] = l32_from_64_S (h[6]);
w[14] = h32_from_64_S (h[7]);
w[15] = l32_from_64_S (h[7]);
w[16] = 0;
w[17] = 0;
w[18] = 0;
w[19] = 0;
w[20] = 0;
w[21] = 0;
w[22] = 0;
w[23] = 0;
w[24] = 0;
w[25] = 0;
w[26] = 0;
w[27] = 0;
w[28] = 0;
w[29] = 0;
w[30] = 0;
w[31] = 0;
sha512_update (&ctx, w, 64);
sha512_update_global_swap (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
w[ 0] = h32_from_64_S (h[0]);
w[ 1] = l32_from_64_S (h[0]);
w[ 2] = h32_from_64_S (h[1]);
w[ 3] = l32_from_64_S (h[1]);
w[ 4] = h32_from_64_S (h[2]);
w[ 5] = l32_from_64_S (h[2]);
w[ 6] = h32_from_64_S (h[3]);
w[ 7] = l32_from_64_S (h[3]);
w[ 8] = h32_from_64_S (h[4]);
w[ 9] = l32_from_64_S (h[4]);
w[10] = h32_from_64_S (h[5]);
w[11] = l32_from_64_S (h[5]);
w[12] = h32_from_64_S (h[6]);
w[13] = l32_from_64_S (h[6]);
w[14] = h32_from_64_S (h[7]);
w[15] = l32_from_64_S (h[7]);
w[16] = 0;
w[17] = 0;
w[18] = 0;
w[19] = 0;
w[20] = 0;
w[21] = 0;
w[22] = 0;
w[23] = 0;
w[24] = 0;
w[25] = 0;
w[26] = 0;
w[27] = 0;
w[28] = 0;
w[29] = 0;
w[30] = 0;
w[31] = 0;
sha512_update (&ctx, w, 64);
sha512_final (&ctx);
w[ 0] = h32_from_64_S (ctx.h[0]);
w[ 1] = l32_from_64_S (ctx.h[0]);
w[ 2] = h32_from_64_S (ctx.h[1]);
w[ 3] = l32_from_64_S (ctx.h[1]);
w[ 4] = h32_from_64_S (ctx.h[2]);
w[ 5] = l32_from_64_S (ctx.h[2]);
w[ 6] = h32_from_64_S (ctx.h[3]);
w[ 7] = l32_from_64_S (ctx.h[3]);
w[ 8] = h32_from_64_S (ctx.h[4]);
w[ 9] = l32_from_64_S (ctx.h[4]);
w[10] = h32_from_64_S (ctx.h[5]);
w[11] = l32_from_64_S (ctx.h[5]);
w[12] = h32_from_64_S (ctx.h[6]);
w[13] = l32_from_64_S (ctx.h[6]);
w[14] = h32_from_64_S (ctx.h[7]);
w[15] = l32_from_64_S (ctx.h[7]);
w[16] = 0;
w[17] = 0;
w[18] = 0;
w[19] = 0;
w[20] = 0;
w[21] = 0;
w[22] = 0;
w[23] = 0;
w[24] = 0;
w[25] = 0;
w[26] = 0;
w[27] = 0;
w[28] = 0;
w[29] = 0;
w[30] = 0;
w[31] = 0;
sha512_hmac_ctx_t hmac_ctx;
sha512_hmac_init (&hmac_ctx, w, 64);
sha512_hmac_update_global_swap (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha512_hmac_final (&hmac_ctx);
const u32 r0 = l32_from_64 (hmac_ctx.opad.h[1]);
const u32 r1 = h32_from_64 (hmac_ctx.opad.h[1]);
const u32 r2 = l32_from_64 (hmac_ctx.opad.h[0]);
const u32 r3 = h32_from_64 (hmac_ctx.opad.h[0]);
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -0,0 +1,351 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha1.cl"
#include "inc_cipher_aes.cl"
#endif
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
typedef struct pbkdf2_sha1_tmp
{
u32 ipad[5];
u32 opad[5];
u32 dgst[32];
u32 out[32];
} pbkdf2_sha1_tmp_t;
typedef struct vmware_vmx
{
u32 salt_buf[64];
u32 iv_buf[4];
u32 ct_buf[4];
} vmware_vmx_t;
DECLSPEC void hmac_sha1_run_V (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *ipad, u32x *opad, u32x *digest)
{
digest[0] = ipad[0];
digest[1] = ipad[1];
digest[2] = ipad[2];
digest[3] = ipad[3];
digest[4] = ipad[4];
sha1_transform_vector (w0, w1, w2, w3, digest);
w0[0] = digest[0];
w0[1] = digest[1];
w0[2] = digest[2];
w0[3] = digest[3];
w1[0] = digest[4];
w1[1] = 0x80000000;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = (64 + 20) * 8;
digest[0] = opad[0];
digest[1] = opad[1];
digest[2] = opad[2];
digest[3] = opad[3];
digest[4] = opad[4];
sha1_transform_vector (w0, w1, w2, w3, digest);
}
KERNEL_FQ void m27400_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha1_tmp_t, vmware_vmx_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
if (gid >= gid_max) return;
sha1_hmac_ctx_t sha1_hmac_ctx;
sha1_hmac_init_global_swap (&sha1_hmac_ctx, pws[gid].i, pws[gid].pw_len);
tmps[gid].ipad[0] = sha1_hmac_ctx.ipad.h[0];
tmps[gid].ipad[1] = sha1_hmac_ctx.ipad.h[1];
tmps[gid].ipad[2] = sha1_hmac_ctx.ipad.h[2];
tmps[gid].ipad[3] = sha1_hmac_ctx.ipad.h[3];
tmps[gid].ipad[4] = sha1_hmac_ctx.ipad.h[4];
tmps[gid].opad[0] = sha1_hmac_ctx.opad.h[0];
tmps[gid].opad[1] = sha1_hmac_ctx.opad.h[1];
tmps[gid].opad[2] = sha1_hmac_ctx.opad.h[2];
tmps[gid].opad[3] = sha1_hmac_ctx.opad.h[3];
tmps[gid].opad[4] = sha1_hmac_ctx.opad.h[4];
sha1_hmac_update_global_swap (&sha1_hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, salt_bufs[SALT_POS].salt_len);
for (u32 i = 0, j = 1; i < 8; i += 5, j += 1)
{
sha1_hmac_ctx_t sha1_hmac_ctx2 = sha1_hmac_ctx;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = j;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
sha1_hmac_update_64 (&sha1_hmac_ctx2, w0, w1, w2, w3, 4);
sha1_hmac_final (&sha1_hmac_ctx2);
tmps[gid].dgst[i + 0] = sha1_hmac_ctx2.opad.h[0];
tmps[gid].dgst[i + 1] = sha1_hmac_ctx2.opad.h[1];
tmps[gid].dgst[i + 2] = sha1_hmac_ctx2.opad.h[2];
tmps[gid].dgst[i + 3] = sha1_hmac_ctx2.opad.h[3];
tmps[gid].dgst[i + 4] = sha1_hmac_ctx2.opad.h[4];
tmps[gid].out[i + 0] = tmps[gid].dgst[i + 0];
tmps[gid].out[i + 1] = tmps[gid].dgst[i + 1];
tmps[gid].out[i + 2] = tmps[gid].dgst[i + 2];
tmps[gid].out[i + 3] = tmps[gid].dgst[i + 3];
tmps[gid].out[i + 4] = tmps[gid].dgst[i + 4];
}
}
KERNEL_FQ void m27400_loop (KERN_ATTR_TMPS_ESALT (pbkdf2_sha1_tmp_t, vmware_vmx_t))
{
const u64 gid = get_global_id (0);
if ((gid * VECT_SIZE) >= gid_max) return;
u32x ipad[5];
u32x opad[5];
ipad[0] = packv (tmps, ipad, gid, 0);
ipad[1] = packv (tmps, ipad, gid, 1);
ipad[2] = packv (tmps, ipad, gid, 2);
ipad[3] = packv (tmps, ipad, gid, 3);
ipad[4] = packv (tmps, ipad, gid, 4);
opad[0] = packv (tmps, opad, gid, 0);
opad[1] = packv (tmps, opad, gid, 1);
opad[2] = packv (tmps, opad, gid, 2);
opad[3] = packv (tmps, opad, gid, 3);
opad[4] = packv (tmps, opad, gid, 4);
for (u32 i = 0; i < 8; i += 5)
{
u32x dgst[5];
u32x out[5];
dgst[0] = packv (tmps, dgst, gid, i + 0);
dgst[1] = packv (tmps, dgst, gid, i + 1);
dgst[2] = packv (tmps, dgst, gid, i + 2);
dgst[3] = packv (tmps, dgst, gid, i + 3);
dgst[4] = packv (tmps, dgst, gid, i + 4);
out[0] = packv (tmps, out, gid, i + 0);
out[1] = packv (tmps, out, gid, i + 1);
out[2] = packv (tmps, out, gid, i + 2);
out[3] = packv (tmps, out, gid, i + 3);
out[4] = packv (tmps, out, gid, i + 4);
for (u32 j = 0; j < loop_cnt; j++)
{
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
w0[0] = dgst[0];
w0[1] = dgst[1];
w0[2] = dgst[2];
w0[3] = dgst[3];
w1[0] = dgst[4];
w1[1] = 0x80000000;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = (64 + 20) * 8;
hmac_sha1_run_V (w0, w1, w2, w3, ipad, opad, dgst);
out[0] ^= dgst[0];
out[1] ^= dgst[1];
out[2] ^= dgst[2];
out[3] ^= dgst[3];
out[4] ^= dgst[4];
}
unpackv (tmps, dgst, gid, i + 0, dgst[0]);
unpackv (tmps, dgst, gid, i + 1, dgst[1]);
unpackv (tmps, dgst, gid, i + 2, dgst[2]);
unpackv (tmps, dgst, gid, i + 3, dgst[3]);
unpackv (tmps, dgst, gid, i + 4, dgst[4]);
unpackv (tmps, out, gid, i + 0, out[0]);
unpackv (tmps, out, gid, i + 1, out[1]);
unpackv (tmps, out, gid, i + 2, out[2]);
unpackv (tmps, out, gid, i + 3, out[3]);
unpackv (tmps, out, gid, i + 4, out[4]);
}
}
KERNEL_FQ void m27400_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha1_tmp_t, vmware_vmx_t))
{
/**
* base
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id(0);
const u64 lsz = get_local_size(0);
/**
* aes shared
*/
#ifdef REAL_SHM
LOCAL_VK u32 s_td0[256];
LOCAL_VK u32 s_td1[256];
LOCAL_VK u32 s_td2[256];
LOCAL_VK u32 s_td3[256];
LOCAL_VK u32 s_td4[256];
LOCAL_VK u32 s_te0[256];
LOCAL_VK u32 s_te1[256];
LOCAL_VK u32 s_te2[256];
LOCAL_VK u32 s_te3[256];
LOCAL_VK u32 s_te4[256];
for (u32 i = lid; i < 256; i += lsz)
{
s_td0[i] = td0[i];
s_td1[i] = td1[i];
s_td2[i] = td2[i];
s_td3[i] = td3[i];
s_td4[i] = td4[i];
s_te0[i] = te0[i];
s_te1[i] = te1[i];
s_te2[i] = te2[i];
s_te3[i] = te3[i];
s_te4[i] = te4[i];
}
SYNC_THREADS();
#else
CONSTANT_AS u32a* s_td0 = td0;
CONSTANT_AS u32a* s_td1 = td1;
CONSTANT_AS u32a* s_td2 = td2;
CONSTANT_AS u32a* s_td3 = td3;
CONSTANT_AS u32a* s_td4 = td4;
CONSTANT_AS u32a* s_te0 = te0;
CONSTANT_AS u32a* s_te1 = te1;
CONSTANT_AS u32a* s_te2 = te2;
CONSTANT_AS u32a* s_te3 = te3;
CONSTANT_AS u32a* s_te4 = te4;
#endif
if (gid >= gid_max) return;
u32 ukey[8];
ukey[0] = tmps[gid].out[0];
ukey[1] = tmps[gid].out[1];
ukey[2] = tmps[gid].out[2];
ukey[3] = tmps[gid].out[3];
ukey[4] = tmps[gid].out[4];
ukey[5] = tmps[gid].out[5];
ukey[6] = tmps[gid].out[6];
ukey[7] = tmps[gid].out[7];
u32 ks[60];
AES256_set_decrypt_key (ks, ukey, s_te0, s_te1, s_te2, s_te3, s_td0, s_td1, s_td2, s_td3);
// iv
u32 iv_buf[4];
iv_buf[0] = esalt_bufs[DIGESTS_OFFSET].iv_buf[0];
iv_buf[1] = esalt_bufs[DIGESTS_OFFSET].iv_buf[1];
iv_buf[2] = esalt_bufs[DIGESTS_OFFSET].iv_buf[2];
iv_buf[3] = esalt_bufs[DIGESTS_OFFSET].iv_buf[3];
// ct
u32 ct_buf[4];
ct_buf[0] = esalt_bufs[DIGESTS_OFFSET].ct_buf[0];
ct_buf[1] = esalt_bufs[DIGESTS_OFFSET].ct_buf[1];
ct_buf[2] = esalt_bufs[DIGESTS_OFFSET].ct_buf[2];
ct_buf[3] = esalt_bufs[DIGESTS_OFFSET].ct_buf[3];
// decrypt first block
u32 pt_buf[4];
AES256_decrypt (ks, ct_buf, pt_buf, s_td0, s_td1, s_td2, s_td3, s_td4);
pt_buf[0] ^= iv_buf[0];
pt_buf[1] ^= iv_buf[1];
pt_buf[2] ^= iv_buf[2];
pt_buf[3] ^= iv_buf[3];
// check
const u32 r0 = pt_buf[0];
const u32 r1 = pt_buf[1];
const u32 r2 = pt_buf[2];
const u32 r3 = pt_buf[3];
#define il_pos 0
#ifdef KERNEL_STATIC
#include COMPARE_M
#endif
}

@ -117,10 +117,7 @@ KERNEL_FQ void gpu_memset (GLOBAL_AS uint4 *buf, const u32 value, const u64 gid_
#if defined IS_NATIVE
r = value;
#elif defined IS_OPENCL
r.s0 = value;
r.s1 = value;
r.s2 = value;
r.s3 = value;
r = (uint4) (value);
#elif defined IS_CUDA
r.x = value;
r.y = value;
@ -136,6 +133,33 @@ KERNEL_FQ void gpu_memset (GLOBAL_AS uint4 *buf, const u32 value, const u64 gid_
buf[gid] = r;
}
KERNEL_FQ void gpu_bzero(GLOBAL_AS uint4* buf, const u64 gid_max)
{
const u64 gid = get_global_id(0);
if (gid >= gid_max) return;
uint4 r;
#if defined IS_NATIVE
r = 0;
#elif defined IS_OPENCL
r = (uint4) (0);
#elif defined IS_CUDA
r.x = 0;
r.y = 0;
r.z = 0;
r.w = 0;
#elif defined IS_HIP
r.x = 0;
r.y = 0;
r.z = 0;
r.w = 0;
#endif
buf[gid] = r;
}
KERNEL_FQ void gpu_atinit (GLOBAL_AS pw_t *buf, const u64 gid_max)
{
const u64 gid = get_global_id (0);

@ -12,7 +12,7 @@ Download the [latest release](https://hashcat.net/hashcat/) and unpack it in the
### Usage/Help ###
Please refer to the [Hashcat Wiki](https://hashcat.net/wiki/) and the output of `--help` for usage information and general help. A list of frequently asked questions may also be found [here](https://hashcat.net/wiki/doku.php?id=frequently_asked_questions). The [Hashcat Forum](https://hashcat.net/forum/) also contains a plethora of information.
Please refer to the [Hashcat Wiki](https://hashcat.net/wiki/) and the output of `--help` for usage information and general help. A list of frequently asked questions may also be found [here](https://hashcat.net/wiki/doku.php?id=frequently_asked_questions). The [Hashcat Forum](https://hashcat.net/forum/) also contains a plethora of information. If you still think you need help by a real human come to [Discord](https://discord.gg/HFS523HGBT).
### Building ###

@ -1,9 +1,83 @@
* changes v6.2.3 -> v6.2.x
##
## Features
##
- Added option --deprecated-check-disable to enable deprecated plugins
- Added option --multiply-accel-disable (short: -M) to disable multiplying of the kernel accel with the multiprocessor count
- Added rule function '3' to change the case of the first letter after the occurrence of N of character X
- Added support for auto tuning --kernel-threads (-T) at startup
- Added support for HIP version 4.3 or later and removed support for older HIP versions as they are not compatible
##
## Bugs
##
- Fixed broken support for --generate-rules-func-min which was ignored under certain conditions
- Fixed buffer overflow in DPAPI masterkey file v1/v2 module in hash_encode() and hash_decode()
- Fixed buffer overflow in Stargazer Stellar Wallet XLM module in hash_encode() when a hash was cracked
- Fixed false negative in all VeraCrypt hash modes if both conditions are met: 1. Use CPU for cracking and 2. PIM area was used
- Fixed invalid data type in the sha384_hmac_init_vector_128() function that take effect if the vector data type was specified manually
- Fixed out-of-boundary read in input_tokenizer() if the signature in the hash is longer than the length of the plugin's signature constant
- Fixed out-of-boundary read in the Stuffit5 module in hash_decode()
- Fixed random rule generator option --generate-rules-func-min by fixing switch() case to not select a not existing option group type
- Fixed syntax check of HAS_VPERM macro in several kernel includes causing invalid error message for AMD GPUs on Windows
- Fixed uninitialized tmps variable in autotune for slow hashes by calling _init and _prepare kernel before calling _loop kernel
##
## Performance
##
- AMD GPUs: Add inline assembly code for md5crypt, sha256crypt, PDF 1.7, 7-Zip, RAR3, Samsung Android and Windows Phone 8+
- AMD GPUs: On the Apple OpenCL platform, we ask for the preferred kernel thread size rather than hard-coding 32
- Backend Interface: Replace most of the blocking Compute API functions with asynchronous ones to improve GPU utilization
- Blake Kernels: Optimize 3/4 BLAKE2B_ROUND() 64-bit rotations with inline assembly hc_byte_perm_S() calls
- Blowfish Kernels: Backport optimizations to reduce bank conflicts from bcrypt to Password Safe v2 and Open Document Format (ODF) 1.1
- ECC secp256k1: The inline assembly code for AMD GPUs has been removed as the latest JIT compilers optimize it with the same efficiency
- HIP Kernels: Enable vector data types for HIP kernels for functionality and performance
- Kernel threads: Use warp size / wavefront size query instead of hard-coded values as the basis for kernel threads
- SCRYPT Kernels: Improve Hashcat.hctune entries for many NV and AMD GPUs for hash mode 8900, 9300, 15700 and 22700
- Tuning Database: Add new module function module_extra_tuningdb_block() to extend hashcat.hctune content from a module
##
## Technical
##
- 7-Zip Hook: Increase the supported data length from 320kb to 8mb
- ADL: Updated support for AMD Display Library to 15.0, updated data types
- AMD Driver: Updated requirements for AMD Linux drivers to ROCm 4.3 or later due to new HIP interface
- AMD Driver: Updated requirements for AMD Windows drivers to Adrenalin 21.2.1 or later due to new ADL library
- Backend Interface: Implement gpu_bzero() as a gpu_memset() replacement, since all gpu_memset() operations used 0 as the value
- Backend Interface: Improve the query kernel's dynamic memory size based on DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN instead of BF
- Brain Session: Adds hashconfig-specific opti_type and opts_type parameters to the session calculation to enable cover functions like -O
- Commandline: Throw an error if the separator specified by the user with the -p option is not exactly 1 byte
- Constants: Make const char * pointers actually const char * const pointers
- Deprecated Plugins: Add new module function module_deprecated_notice() to mark a plugin as deprecated and to return a free text user notice
- Deprecated Plugins: Marked plugins 2500/2501 and 16800/16801 as deprecated
- Encoding: Truncate password candidates in UTF8 -> UTF16 conversion if it contains an invalid UTF8 byte sequence
- Filehandling: Use const char for fopen mode to fix -Wwrite-strings warnings
- Hardware Monitor: Added support for OverDrive 7 and 8 based GPUs
- HIP Kernels: Dependency on hip/hip runtime.h has been removed to enable easier integration of the HIP backend under Windows
- Kernel cache: Add kernel threads for hash calculation, which will later be used in the file name of the kernel cache
- Memory Management: Refactored the code responsible for limiting kernel accel with the goal to avoid low host memory situations
- OpenCL Runtime: Workaround for Intel OpenCL runtime: segmentation fault when compiling hc_enc_next() / hc_enc_next_global()
- RC4 Kernels: Use improved native thread derivation for RC4-based hash modes 7500, 13100, 18200, 25400
- Shared Memory: Calculate the dynamic memory size of the kernel based on CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN
- Slow kernels: Set some of the slowest kernels to OPTS_TYPE_MP_MULTI_DISABLE to make it easier to handle small word lists
- Vendor Discovery: Add "Intel" as a valid vendor name for GPUs on macOS
##
## Algorithms
##
- Added hash-mode: SNMPv3 HMAC-MD5-96/HMAC-SHA1-96
- Added hash-mode: SNMPv3 HMAC-MD5-96
- Added hash-mode: SNMPv3 HMAC-SHA1-96
- Added hash-mode: SNMPv3 HMAC-SHA224-128
- Added hash-mode: SNMPv3 HMAC-SHA256-192
- Added hash-mode: SNMPv3 HMAC-SHA384-256
- Added hash-mode: SNMPv3 HMAC-SHA512-384
- Added hash-mode: VirtualBox (PBKDF2-HMAC-SHA256 & AES-128-XTS)
- Added hash-mode: VirtualBox (PBKDF2-HMAC-SHA256 & AES-256-XTS)

@ -1,2 +1,3 @@
web: https://hashcat.net
irc: freenode #hashcat
discord: https://discord.gg/HFS523HGBT
irc: Libera Chat #hashcat

@ -10,8 +10,8 @@
hashcat v6.2.3
==============
AMD GPUs on Linux require "RadeonOpenCompute (ROCm)" Software Platform (3.1 or later)
AMD GPUs on Windows require "AMD Radeon Adrenalin 2020 Edition" (20.2.2 or later)
AMD GPUs on Linux require "AMD ROCm" (4.3 or later)
AMD GPUs on Windows require "AMD Radeon Adrenalin 2020 Edition" (21.2.1 or later)
Intel CPUs require "OpenCL Runtime for Intel Core and Intel Xeon Processors" (16.1.1 or later)
NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or later)
@ -155,6 +155,13 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or
- SIP digest authentication (MD5)
- IKE-PSK MD5
- IKE-PSK SHA1
- SNMPv3 HMAC-MD5-96/HMAC-SHA1-96
- SNMPv3 HMAC-MD5-96
- SNMPv3 HMAC-SHA1-96
- SNMPv3 HMAC-SHA224-128
- SNMPv3 HMAC-SHA256-192
- SNMPv3 HMAC-SHA384-256
- SNMPv3 HMAC-SHA512-384
- WPA-EAPOL-PBKDF2
- WPA-EAPOL-PMK
- WPA-PBKDF2-PMKID+EAPOL
@ -265,6 +272,7 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or
- Huawei sha1(md5($pass).$salt)
- AuthMe sha256
- AES Crypt (SHA256)
- VMware VMX (PBKDF2-HMAC-SHA1 + AES-256-CBC)
- LUKS
- VeraCrypt
- BestCrypt v3 Volume Encryption

@ -5,6 +5,7 @@
#define RULE_OP_MANGLE_UREST_LFIRST 'C' // upper case all chars, lower case 1st
#define RULE_OP_MANGLE_TREST 't' // switch the case of each char
#define RULE_OP_MANGLE_TOGGLE_AT 'T' // switch the case of each char on pos N
#define RULE_OP_MANGLE_TOGGLE_AT_SEP '3' // switch the case of the first letter after occurrence N of char X
#define RULE_OP_MANGLE_REVERSE 'r' // reverse word
#define RULE_OP_MANGLE_DUPEWORD 'd' // append word to itself
#define RULE_OP_MANGLE_DUPEWORD_TIMES 'p' // append word to itself N times

@ -24,6 +24,7 @@ We're a group of people participating in the yearly repeating password cracking
| CracktheCon | Cyphercon, Milwaukee | 2019 | 1st |
| Crack Me If You Can | DEF CON, Las Vegas | 2019 | 1st |
| Crack Me If You Can | DEF CON, Remote | 2020 | 1st |
| Crack Me If You Can | DEF CON, Las Vegas | 2021 | 1st |
* Special recognition for team hashcat goes to:

@ -426,7 +426,7 @@ _hashcat ()
local BUILD_IN_CHARSETS='?l ?u ?d ?a ?b ?s ?h ?H'
local SHORT_OPTS="-m -a -V -h -b -t -T -o -p -c -d -D -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z"
local LONG_OPTS="--hash-type --attack-mode --version --help --quiet --benchmark --benchmark-all --hex-salt --hex-wordlist --hex-charset --force --status --status-json --status-timer --stdin-timeout-abort --machine-readable --loopback --markov-hcstat2 --markov-disable --markov-inverse --markov-classic --markov-threshold --runtime --session --speed-only --progress-only --restore --restore-file-path --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-check-timer --outfile-check-dir --wordlist-autohex-disable --separator --show --left --username --remove --remove-timer --potfile-disable --potfile-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --example-hashes --hash-info --backend-ignore-cuda --backend-ignore-opencl --backend-info --backend-devices --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-disable --hwmon-temp-abort --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --stdout --keep-guessing --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --optimized-kernel-enable --self-test-disable --slow-candidates --brain-server --brain-server-timer --brain-client --brain-client-features --brain-host --brain-port --brain-session --brain-session-whitelist --brain-password --identify"
local LONG_OPTS="--hash-type --attack-mode --version --help --quiet --benchmark --benchmark-all --hex-salt --hex-wordlist --hex-charset --force --status --status-json --status-timer --stdin-timeout-abort --machine-readable --loopback --markov-hcstat2 --markov-disable --markov-inverse --markov-classic --markov-threshold --runtime --session --speed-only --progress-only --restore --restore-file-path --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-check-timer --outfile-check-dir --wordlist-autohex-disable --separator --show --deprecated-check-disable --left --username --remove --remove-timer --potfile-disable --potfile-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --example-hashes --hash-info --backend-ignore-cuda --backend-ignore-opencl --backend-info --backend-devices --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-disable --hwmon-temp-abort --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --stdout --keep-guessing --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --optimized-kernel-enable --multiply-accel-disable --self-test-disable --slow-candidates --brain-server --brain-server-timer --brain-client --brain-client-features --brain-host --brain-port --brain-session --brain-session-whitelist --brain-password --identify"
local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --status-timer --stdin-timeout-abort --markov-hcstat2 --markov-threshold --runtime --session --timer --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --potfile-path --restore-file-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --backend-devices --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-temp-abort --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment-min --increment-max --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --brain-server-timer --brain-client-features --brain-host --brain-password --brain-port --brain-session --brain-session-whitelist"
COMPREPLY=()

@ -280,6 +280,14 @@ GeForce_RTX_3090 ALIAS_nv_sm50_or_higher
Device_738c ALIAS_AMD_MI100
AMD_Radeon_(TM)_RX_480_Graphics ALIAS_AMD_RX480
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] ALIAS_AMD_Vega64
AMD_Radeon_Vega_64 ALIAS_AMD_Vega64
Device_73bf ALIAS_AMD_RX6900XT
AMD_Radeon_RX_6900_XT ALIAS_AMD_RX6900XT
#############
## ENTRIES ##
#############
@ -378,146 +386,13 @@ GeForce_GTX_TITAN 3 9900 2 A
DEVICE_TYPE_CPU * 3200 1 N A
##
## SCRYPT
## SCRYPT: Tunings for SCRYPT based hash-modes can be found inside the plugin source
## See function module_extra_tuningdb_block()
##
DEVICE_TYPE_CPU * 8900 1 N A
DEVICE_TYPE_CPU * 9300 1 N A
DEVICE_TYPE_CPU * 15700 1 N A
DEVICE_TYPE_CPU * 22700 1 N A
DEVICE_TYPE_GPU * 8900 1 N A
DEVICE_TYPE_GPU * 9300 1 N A
DEVICE_TYPE_GPU * 15700 1 1 A
DEVICE_TYPE_GPU * 22700 1 N A
##
## CryptoAPI
##
DEVICE_TYPE_CPU * 14500 1 A A
DEVICE_TYPE_GPU * 14500 1 A A
## Here's an example of how to manually tune SCRYPT algorithm kernels for your hardware.
## Manually tuning the GPU will yield increased performance. There is typically no noticeable change to CPU performance.
##
## First, you need to know the parameters of your SCRYPT hash: N, r and p.
##
## The reference SCRYPT parameter values are N=14, r=8 and p=1, but these will likely not match the parameters used by real-world applications.
## For reference, the N value represents an exponent (2^N, which we calculate by bit shifting 1 left by N bits).
## Hashcat expects this N value in decimal format: 1 << 14 = 16384
##
## Now that you have the 3 configuration items in decimal format, multiply them by 128 (underlaying crypto primitive block size).
## For example: 128 * 16384 * 8 * 1 = 16777216 = 16MB
## This is the amount of memory required for the GPU to compute the hash of one password candidate.
##
## Hashcat computes multiple password candidates in parallel - this is what allows for full utilization of the device.
## The number of password candidates that Hashcat can run in parallel is VRAM limited and depends on:
##
## 1. Compute devices' native compute units
## 2. Compute devices' native thread count
## 3. Artificial multiplier (--kernel-accel aka -n)
##
## In order to find these values:
##
## 1. On startup Hashcat will show: * Device #1: GeForce GTX 980, 3963/4043 MB, 16MCU. The 16 MCU is the number of compute units on that device.
## 2. Native thread counts are fixed values: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefronts), GPU-NVIDIA=32 (warps)
##
## Now multiply them together. For my GTX980: 16 * 32 * 16777216 = 8589934592 = 8GB
##
## If we want to actually make use of all computing resources, this GPU would require 8GB of GPU RAM.
## However, it doesn't have that:
##
## Device #1: GeForce GTX 980, 3963/4043 MB, 16MCU. We only have 4043 MB (4GB minus some overhead from the OS).
##
## How do we deal with this? This is where SCRYPT TMTO(time-memory trde off) kicks in. The SCRYPT algorithm is designed in such a way that we
## can pre-compute that 16MB buffer from a self-choosen offset. Details on how this actually works are not important for this process.
##
## What's relevant to us is that we can halve the buffer size, but we pay with twice the computation time.
## We can repeat this as often as we want. That's why it's a trade-off.
##
## This mechanic can be manually set using --scrypt-tmto on the commandline, but this is not the best way.
##
## Back to our problem. We need 8GB of memory but have only ~4GB.
## It's not a full 4GB. The OS needs some of it and Hashcat needs some of it to store password candidates and other things.
## If you run a headless server it should be safe to subtract a fixed value of 200MB from whatever you have in your GPU.
##
## So lets divide our required memory(8GB) by 2 until it fits in our VRAM - 200MB.
##
## (8GB >> 0) = 8GB < 3.8GB = No, Does not fit
## (8GB >> 1) = 4GB < 3.8GB = No, Does not fit
## (8GB >> 2) = 2GB < 3.8GB = Yes!
##
## This process is automated in Hashcat, but it is important to understand what's happening here.
## Because of the light overhead from the OS and Hashcat, we pay a very high price.
## Even though it is just 200MB, it forces us to increase the TMTO by another step.
## In terms of speed, the speed is now only 1/4 of what we could archieve on that same GPU if it had only 8.2GB ram.
## But now we end up in a situation that we waste 1.8GB RAM which costs us ((1.8GB/16MB)>>1) candidates/second.
##
## This is where manual tuning can come into play.
## If we know that the resources we need are close to what we have (in this case 3.8GB <-> 4.0GB)
## We could decide to throw away some of our compute units so that we will no longer need 4.0GB but only 3.8GB.
## Therefore, we do not need to increase the TMTO by another step to fit in VRAM.
##
## If we cut down our 16 MCU to only 15 MCU or 14 MCU using --kernel-accel(-n), we end up with:
##
## 16 * 32 * 16777216 = 8589934592 / 2 = 4294967296 = 4.00GB < 3.80GB = Nope, next
## 15 * 32 * 16777216 = 8053063680 / 2 = 4026531840 = 3.84GB < 3.80GB = Nope, next
## 14 * 32 * 16777216 = 7516192768 / 2 = 3758096384 = 3.58GB < 3.80GB = Yes!
##
## So we can throw away 2/16 compute units, but save half of the computation trade-off on the rest of the compute device.
## On my GTX980, this improves the performance from 163 H/s to 201 H/s.
## You don't need to control --scrypt-tmto manually because now that the multiplier (-n) is smaller than the native value
## Hashcat will automatically realize it can decrease the TMTO by one.
##
## At this point, you found the optimal base value for your compute device. In this case: 14.
##
## Depending on our hardware, especially hardware with very slow memory access like a GPU
## there's a good chance that it's cheaper (faster) to compute an extra step on the GPU register.
## So if we increase the TMTO again by one, this gives an extra speed boost.
##
## On my GTX980, this improves the performance from 201 H/s to 255 H/s.
## Again, there's no need to control this with --scrypt-tmto. Hashcat will realize it has to increase the TMTO again.
##
## All together, you can control all of this by using the -n parameter in the command line.
## This is not ideal in a production environment because you must use the --force flag.
## The best way to set this is by using this Hashcat.hctune file to store it. This avoids the need to bypass any warnings.
##
## Find the ideal -n value, then store it here along with the proper compute device name.
## Formatting guidelines are availabe at the top of this document.
## 4GB
GeForce_GTX_980 * 8900 1 28 A
GeForce_GTX_980 * 9300 1 128 A
GeForce_GTX_980 * 15700 1 28 A
GeForce_GTX_980 * 22700 1 28 A
## 8GB
GeForce_GTX_1080 * 8900 1 14 A
GeForce_GTX_1080 * 9300 1 256 A
GeForce_GTX_1080 * 15700 1 14 A
GeForce_GTX_1080 * 22700 1 14 A
## 11GB
GeForce_RTX_2080_Ti * 8900 1 68 A
GeForce_RTX_2080_Ti * 9300 1 532 A
GeForce_RTX_2080_Ti * 15700 1 68 A
GeForce_RTX_2080_Ti * 22700 1 68 A
## 4GB
AMD_Radeon_(TM)_RX_480_Graphics * 8900 1 14 A
AMD_Radeon_(TM)_RX_480_Graphics * 9300 1 126 A
AMD_Radeon_(TM)_RX_480_Graphics * 15700 1 14 A
AMD_Radeon_(TM)_RX_480_Graphics * 22700 1 14 A
## 8GB
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 8900 1 28 A
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 9300 1 442 A
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 15700 1 28 A
Vega_10_XL/XT_[Radeon_RX_Vega_56/64] * 22700 1 28 A
## 32GB
ALIAS_AMD_MI100 * 8900 1 76 A
ALIAS_AMD_MI100 * 9300 1 63 A
ALIAS_AMD_MI100 * 15700 1 76 A
ALIAS_AMD_MI100 * 22700 1 76 A

@ -9,18 +9,19 @@
#include <stdio.h>
#include <errno.h>
static const char CL_VENDOR_AMD1[] = "Advanced Micro Devices, Inc.";
static const char CL_VENDOR_AMD2[] = "AuthenticAMD";
static const char CL_VENDOR_AMD_USE_INTEL[] = "GenuineIntel";
static const char CL_VENDOR_APPLE[] = "Apple";
static const char CL_VENDOR_APPLE_USE_AMD[] = "AMD";
static const char CL_VENDOR_APPLE_USE_NV[] = "NVIDIA";
static const char CL_VENDOR_APPLE_USE_INTEL[] = "Intel Inc.";
static const char CL_VENDOR_INTEL_BEIGNET[] = "Intel";
static const char CL_VENDOR_INTEL_SDK[] = "Intel(R) Corporation";
static const char CL_VENDOR_MESA[] = "Mesa";
static const char CL_VENDOR_NV[] = "NVIDIA Corporation";
static const char CL_VENDOR_POCL[] = "The pocl project";
static const char CL_VENDOR_AMD1[] = "Advanced Micro Devices, Inc.";
static const char CL_VENDOR_AMD2[] = "AuthenticAMD";
static const char CL_VENDOR_AMD_USE_INTEL[] = "GenuineIntel";
static const char CL_VENDOR_APPLE[] = "Apple";
static const char CL_VENDOR_APPLE_USE_AMD[] = "AMD";
static const char CL_VENDOR_APPLE_USE_NV[] = "NVIDIA";
static const char CL_VENDOR_APPLE_USE_INTEL[] = "Intel";
static const char CL_VENDOR_APPLE_USE_INTEL2[] = "Intel Inc.";
static const char CL_VENDOR_INTEL_BEIGNET[] = "Intel";
static const char CL_VENDOR_INTEL_SDK[] = "Intel(R) Corporation";
static const char CL_VENDOR_MESA[] = "Mesa";
static const char CL_VENDOR_NV[] = "NVIDIA Corporation";
static const char CL_VENDOR_POCL[] = "The pocl project";
int cuda_init (hashcat_ctx_t *hashcat_ctx);
void cuda_close (hashcat_ctx_t *hashcat_ctx);
@ -68,10 +69,12 @@ int hc_cuFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, CUfunction hfunc,
int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
int hc_cuMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
int hc_cuMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
int hc_cuMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr);
int hc_cuMemsetD32Async (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
int hc_cuMemsetD8Async (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod);
@ -85,53 +88,50 @@ int hc_cuLinkAddData (hashcat_ctx_t *hashcat_ctx, CUlinkState state,
int hc_cuLinkDestroy (hashcat_ctx_t *hashcat_ctx, CUlinkState state);
int hc_cuLinkComplete (hashcat_ctx_t *hashcat_ctx, CUlinkState state, void **cubinOut, size_t *sizeOut);
int hc_nvrtcCreateProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames);
int hc_nvrtcDestroyProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram *prog);
int hc_nvrtcCompileProgram (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int numOptions, const char * const *options);
int hc_nvrtcGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *logSizeRet);
int hc_nvrtcGetProgramLog (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *log);
int hc_nvrtcGetPTXSize (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, size_t *ptxSizeRet);
int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, char *ptx);
int hc_nvrtcVersion (hashcat_ctx_t *hashcat_ctx, int *major, int *minor);
int hc_hipCtxCreate (hashcat_ctx_t *hashcat_ctx, HIPcontext *pctx, unsigned int flags, HIPdevice dev);
int hc_hipCtxDestroy (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx);
int hc_hipCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx);
int hc_hipCtxSetCacheConfig (hashcat_ctx_t *hashcat_ctx, HIPfunc_cache config);
int hc_hipCreateProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram *prog, const char *src, const char *name, int numHeaders, const char * const *headers, const char * const *includeNames);
int hc_hipDestroyProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram *prog);
int hc_hipCompileProgram (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, int numOptions, const char * const *options);
int hc_hipGetProgramLogSize (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, size_t *logSizeRet);
int hc_hipGetProgramLog (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, char *log);
int hc_hipGetCodeSize (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, size_t *codeSizeRet);
int hc_hipGetCode (hashcat_ctx_t *hashcat_ctx, hiprtcProgram prog, char *code);
int hc_hipCtxCreate (hashcat_ctx_t *hashcat_ctx, hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
int hc_hipCtxDestroy (hashcat_ctx_t *hashcat_ctx, hipCtx_t ctx);
int hc_hipCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, hipCtx_t *pctx);
int hc_hipCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, hipCtx_t ctx);
int hc_hipCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, hipCtx_t ctx);
int hc_hipCtxSynchronize (hashcat_ctx_t *hashcat_ctx);
int hc_hipDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, HIPdevice_attribute attrib, HIPdevice dev);
int hc_hipDeviceGet (hashcat_ctx_t *hashcat_ctx, hipDevice_t *device, int ordinal);
int hc_hipDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
int hc_hipDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count);
int hc_hipDeviceGet (hashcat_ctx_t *hashcat_ctx, HIPdevice *device, int ordinal);
int hc_hipDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, HIPdevice dev);
int hc_hipDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, HIPdevice dev);
int hc_hipDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, hipDevice_t dev);
int hc_hipDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, hipDevice_t dev);
int hc_hipDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion);
int hc_hipEventCreate (hashcat_ctx_t *hashcat_ctx, HIPevent *phEvent, unsigned int Flags);
int hc_hipEventDestroy (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent);
int hc_hipEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, HIPevent hStart, HIPevent hEnd);
int hc_hipEventQuery (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent);
int hc_hipEventRecord (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent, HIPstream hStream);
int hc_hipEventSynchronize (hashcat_ctx_t *hashcat_ctx, HIPevent hEvent);
int hc_hipFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, HIPfunction_attribute attrib, HIPfunction hfunc);
//int hc_hipFuncSetAttribute (hashcat_ctx_t *hashcat_ctx, HIPfunction hfunc, HIPfunction_attribute attrib, int value);
int hc_hipEventCreate (hashcat_ctx_t *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags);
int hc_hipEventDestroy (hashcat_ctx_t *hashcat_ctx, hipEvent_t hEvent);
int hc_hipEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
int hc_hipEventQuery (hashcat_ctx_t *hashcat_ctx, hipEvent_t hEvent);
int hc_hipEventRecord (hashcat_ctx_t *hashcat_ctx, hipEvent_t hEvent, hipStream_t hStream);
int hc_hipEventSynchronize (hashcat_ctx_t *hashcat_ctx, hipEvent_t hEvent);
int hc_hipFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, hipFunction_attribute attrib, hipFunction_t hfunc);
int hc_hipInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags);
int hc_hipLaunchKernel (hashcat_ctx_t *hashcat_ctx, HIPfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, HIPstream hStream, void **kernelParams, void **extra);
int hc_hipMemAlloc (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr *dptr, size_t bytesize);
int hc_hipMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dstDevice, HIPdeviceptr srcDevice, size_t ByteCount);
int hc_hipMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, HIPdeviceptr srcDevice, size_t ByteCount);
int hc_hipMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
int hc_hipMemFree (hashcat_ctx_t *hashcat_ctx, HIPdeviceptr dptr);
int hc_hipModuleGetFunction (hashcat_ctx_t *hashcat_ctx, HIPfunction *hfunc, HIPmodule hmod, const char *name);
int hc_hipModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, HIPmodule *module, const void *image, unsigned int numOptions, HIPjit_option *options, void **optionValues);
int hc_hipModuleUnload (hashcat_ctx_t *hashcat_ctx, HIPmodule hmod);
int hc_hipStreamCreate (hashcat_ctx_t *hashcat_ctx, HIPstream *phStream, unsigned int Flags);
int hc_hipStreamDestroy (hashcat_ctx_t *hashcat_ctx, HIPstream hStream);
int hc_hipStreamSynchronize (hashcat_ctx_t *hashcat_ctx, HIPstream hStream);
int hc_hipCtxPushCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext ctx);
int hc_hipCtxPopCurrent (hashcat_ctx_t *hashcat_ctx, HIPcontext *pctx);
int hc_hipLinkCreate (hashcat_ctx_t *hashcat_ctx, unsigned int numOptions, HIPjit_option *options, void **optionValues, HIPlinkState *stateOut);
int hc_hipLinkAddData (hashcat_ctx_t *hashcat_ctx, HIPlinkState state, HIPjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, HIPjit_option *options, void **optionValues);
int hc_hipLinkDestroy (hashcat_ctx_t *hashcat_ctx, HIPlinkState state);
int hc_hipLinkComplete (hashcat_ctx_t *hashcat_ctx, HIPlinkState state, void **cubinOut, size_t *sizeOut);
int hc_hipLaunchKernel (hashcat_ctx_t *hashcat_ctx, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
int hc_hipMemAlloc (hashcat_ctx_t *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize);
int hc_hipMemFree (hashcat_ctx_t *hashcat_ctx, hipDeviceptr_t dptr);
int hc_hipMemcpyDtoDAsync (hashcat_ctx_t *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
int hc_hipMemcpyDtoHAsync (hashcat_ctx_t *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
int hc_hipMemcpyHtoDAsync (hashcat_ctx_t *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
int hc_hipMemsetD32Async (hashcat_ctx_t *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream);
int hc_hipMemsetD8Async (hashcat_ctx_t *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream);
int hc_hipModuleGetFunction (hashcat_ctx_t *hashcat_ctx, hipFunction_t *hfunc, hipModule_t hmod, const char *name);
int hc_hipModuleGetGlobal (hashcat_ctx_t *hashcat_ctx, hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
int hc_hipModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
int hc_hipModuleUnload (hashcat_ctx_t *hashcat_ctx, hipModule_t hmod);
int hc_hipRuntimeGetVersion (hashcat_ctx_t *hashcat_ctx, int *runtimeVersion);
int hc_hipStreamCreate (hashcat_ctx_t *hashcat_ctx, hipStream_t *phStream, unsigned int Flags);
int hc_hipStreamDestroy (hashcat_ctx_t *hashcat_ctx, hipStream_t hStream);
int hc_hipStreamSynchronize (hashcat_ctx_t *hashcat_ctx, hipStream_t hStream);
int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data);
int hc_clCompileProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, cl_uint num_input_headers, const cl_program *input_headers, const char **header_include_names, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data);
@ -142,6 +142,7 @@ int hc_clCreateKernel (hashcat_ctx_t *hashcat_ctx, cl_program program
int hc_clCreateProgramWithBinary (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, cl_program *program);
int hc_clCreateProgramWithSource (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_program *program);
int hc_clEnqueueCopyBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
int hc_clEnqueueFillBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
int hc_clEnqueueMapBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, void **buf);
int hc_clEnqueueNDRangeKernel (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
int hc_clEnqueueReadBuffer (hashcat_ctx_t *hashcat_ctx, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
@ -177,17 +178,20 @@ void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u64 p
int run_cuda_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num);
int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 num);
int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u32 value, const u64 size);
int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u8 value, const u64 size);
int run_cuda_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u32 value, const u64 size);
int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size);
int run_hip_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 num);
int run_hip_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 num);
int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u32 value, const u64 size);
int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 size);
int run_hip_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 num);
int run_hip_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 num);
int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u8 value, const u64 size);
int run_hip_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u32 value, const u64 size);
int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 size);
int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num);
int run_opencl_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num);
int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size);
int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 offset, const u8 value, const u64 size);
int run_opencl_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 offset, const u32 value, const u64 size);
int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size);
int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 pws_pos, const u64 num, const u32 event_update, const u32 iteration);

@ -63,6 +63,14 @@
#define HC_API_CALL
#endif
#if defined (__GNUC__)
#define HC_ALIGN(x) __attribute__((aligned(x)))
#elif defined (_MSC_VER)
#define HC_ALIGN(x) __declspec(align(x))
#else
#define HC_ALIGN(x)
#endif
#if defined (_WIN)
#define WIN32_LEAN_AND_MEAN
#endif

@ -13,228 +13,569 @@
#include <windows.h>
#endif // _WIN
/**
* Declarations from adl_sdk.h and subheaders
*/
// Declarations from:
// https://github.com/GPUOpen-LibrariesAndSDKs/display-library/blob/209538e1dc7273f7459411a3a5044ffe2437ed95/include/adl_defines.h
// https://github.com/GPUOpen-LibrariesAndSDKs/display-library/blob/209538e1dc7273f7459411a3a5044ffe2437ed95/include/adl_structures.h
#define ADL_OK 0
#define ADL_ERR -1
#define ADL_ERR_NOT_SUPPORTED -8
/// Defines ADL_TRUE
#define ADL_TRUE 1
/// Defines ADL_FALSE
#define ADL_FALSE 0
#define ADL_MAX_PATH 256
//Define Performance Metrics Log max sensors number
#define ADL_PMLOG_MAX_SENSORS 256
#define ADL_DL_FANCTRL_SPEED_TYPE_PERCENT 1
#define ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED 1
typedef enum ADLSensorType
{
SENSOR_MAXTYPES = 0,
PMLOG_CLK_GFXCLK = 1,
PMLOG_CLK_MEMCLK = 2,
PMLOG_CLK_SOCCLK = 3,
PMLOG_CLK_UVDCLK1 = 4,
PMLOG_CLK_UVDCLK2 = 5,
PMLOG_CLK_VCECLK = 6,
PMLOG_CLK_VCNCLK = 7,
PMLOG_TEMPERATURE_EDGE = 8,
PMLOG_TEMPERATURE_MEM = 9,
PMLOG_TEMPERATURE_VRVDDC = 10,
PMLOG_TEMPERATURE_VRMVDD = 11,
PMLOG_TEMPERATURE_LIQUID = 12,
PMLOG_TEMPERATURE_PLX = 13,
PMLOG_FAN_RPM = 14,
PMLOG_FAN_PERCENTAGE = 15,
PMLOG_SOC_VOLTAGE = 16,
PMLOG_SOC_POWER = 17,
PMLOG_SOC_CURRENT = 18,
PMLOG_INFO_ACTIVITY_GFX = 19,
PMLOG_INFO_ACTIVITY_MEM = 20,
PMLOG_GFX_VOLTAGE = 21,
PMLOG_MEM_VOLTAGE = 22,
PMLOG_ASIC_POWER = 23,
PMLOG_TEMPERATURE_VRSOC = 24,
PMLOG_TEMPERATURE_VRMVDD0 = 25,
PMLOG_TEMPERATURE_VRMVDD1 = 26,
PMLOG_TEMPERATURE_HOTSPOT = 27,
PMLOG_TEMPERATURE_GFX = 28,
PMLOG_TEMPERATURE_SOC = 29,
PMLOG_GFX_POWER = 30,
PMLOG_GFX_CURRENT = 31,
PMLOG_TEMPERATURE_CPU = 32,
PMLOG_CPU_POWER = 33,
PMLOG_CLK_CPUCLK = 34,
PMLOG_THROTTLER_STATUS = 35,
PMLOG_CLK_VCN1CLK1 = 36,
PMLOG_CLK_VCN1CLK2 = 37,
PMLOG_SMART_POWERSHIFT_CPU = 38,
PMLOG_SMART_POWERSHIFT_DGPU = 39,
PMLOG_BUS_SPEED = 40,
PMLOG_BUS_LANES = 41,
PMLOG_MAX_SENSORS_REAL
} ADLSensorType;
/// Defines the maximum string length
#define ADL_MAX_CHAR 4096
/// Defines the maximum string length
#define ADL_MAX_PATH 256
/// Defines the maximum number of supported adapters
#define ADL_MAX_ADAPTERS 250
/// Defines the maxumum number of supported displays
#define ADL_MAX_DISPLAYS 150
/// Defines the maxumum string length for device name
#define ADL_MAX_DEVICENAME 32
/// Defines for all adapters
#define ADL_ADAPTER_INDEX_ALL -1
/// \defgroup define_adl_results Result Codes
/// This group of definitions are the various results returned by all ADL functions \n
/// @{
/// All OK, but need to wait
#define ADL_OK_WAIT 4
/// All OK, but need restart
#define ADL_OK_RESTART 3
/// All OK but need mode change
#define ADL_OK_MODE_CHANGE 2
/// All OK, but with warning
#define ADL_OK_WARNING 1
/// ADL function completed successfully
#define ADL_OK 0
/// Generic Error. Most likely one or more of the Escape calls to the driver failed!
#define ADL_ERR -1
/// ADL not initialized
#define ADL_ERR_NOT_INIT -2
/// One of the parameter passed is invalid
#define ADL_ERR_INVALID_PARAM -3
/// One of the parameter size is invalid
#define ADL_ERR_INVALID_PARAM_SIZE -4
/// Invalid ADL index passed
#define ADL_ERR_INVALID_ADL_IDX -5
/// Invalid controller index passed
#define ADL_ERR_INVALID_CONTROLLER_IDX -6
/// Invalid display index passed
#define ADL_ERR_INVALID_DIPLAY_IDX -7
/// Function not supported by the driver
#define ADL_ERR_NOT_SUPPORTED -8
/// Null Pointer error
#define ADL_ERR_NULL_POINTER -9
/// Call can't be made due to disabled adapter
#define ADL_ERR_DISABLED_ADAPTER -10
/// Invalid Callback
#define ADL_ERR_INVALID_CALLBACK -11
/// Display Resource conflict
#define ADL_ERR_RESOURCE_CONFLICT -12
//Failed to update some of the values. Can be returned by set request that include multiple values if not all values were successfully committed.
#define ADL_ERR_SET_INCOMPLETE -20
/// There's no Linux XDisplay in Linux Console environment
#define ADL_ERR_NO_XDISPLAY -21
//values for ADLFanSpeedValue.iSpeedType
#define ADL_DL_FANCTRL_SPEED_TYPE_PERCENT 1
#define ADL_DL_FANCTRL_SPEED_TYPE_RPM 2
//values for ADLFanSpeedValue.iFlags
#define ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED 1
/**
* Declarations from adl_structures.h
*/
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about the graphics adapter.
///
/// This structure is used to store various information about the graphics adapter. This
/// information can be returned to the user. Alternatively, it can be used to access various driver calls to set
/// or fetch various settings upon the user's request.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct AdapterInfo
{
int iSize;
int iAdapterIndex;
char strUDID[ADL_MAX_PATH];
int iBusNumber;
int iDeviceNumber;
int iFunctionNumber;
int iVendorID;
char strAdapterName[ADL_MAX_PATH];
char strDisplayName[ADL_MAX_PATH];
int iPresent;
#if defined (_WIN32) || defined (_WIN64) || defined (__CYGWIN__)
int iExist;
char strDriverPath[ADL_MAX_PATH];
char strDriverPathExt[ADL_MAX_PATH];
char strPNPString[ADL_MAX_PATH];
int iOSDisplayIndex;
#endif /* (_WIN32) || (_WIN64) || (__CYGWIN__) */
#if defined (__linux__)
int iXScreenNum;
int iDrvIndex;
char strXScreenConfigName[ADL_MAX_PATH];
#endif /* (__linux__) */
/// \ALL_STRUCT_MEM
/// Size of the structure.
int iSize;
/// The ADL index handle. One GPU may be associated with one or two index handles
int iAdapterIndex;
/// The unique device ID associated with this adapter.
char strUDID[ADL_MAX_PATH];
/// The BUS number associated with this adapter.
int iBusNumber;
/// The driver number associated with this adapter.
int iDeviceNumber;
/// The function number.
int iFunctionNumber;
/// The vendor ID associated with this adapter.
int iVendorID;
/// Adapter name.
char strAdapterName[ADL_MAX_PATH];
/// Display name. For example, "\\\\Display0" for Windows or ":0:0" for Linux.
char strDisplayName[ADL_MAX_PATH];
/// Present or not; 1 if present and 0 if not present.It the logical adapter is present, the display name such as \\\\.\\Display1 can be found from OS
int iPresent;
#if defined (_WIN32) || defined (_WIN64)
/// \WIN_STRUCT_MEM
/// Exist or not; 1 is exist and 0 is not present.
int iExist;
/// Driver registry path.
char strDriverPath[ADL_MAX_PATH];
/// Driver registry path Ext for.
char strDriverPathExt[ADL_MAX_PATH];
/// PNP string from Windows.
char strPNPString[ADL_MAX_PATH];
/// It is generated from EnumDisplayDevices.
int iOSDisplayIndex;
#endif /* (_WIN32) || (_WIN64) */
#if defined (LINUX)
/// \LNX_STRUCT_MEM
/// Internal X screen number from GPUMapInfo (DEPRICATED use XScreenInfo)
int iXScreenNum;
/// Internal driver index from GPUMapInfo
int iDrvIndex;
/// \deprecated Internal x config file screen identifier name. Use XScreenInfo instead.
char strXScreenConfigName[ADL_MAX_PATH];
#endif /* (LINUX) */
} AdapterInfo, *LPAdapterInfo;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about thermal controller.
///
/// This structure is used to store information about thermal controller.
/// This structure is used by ADL_PM_ThermalDevices_Enum.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLThermalControllerInfo
{
/// Must be set to the size of the structure
int iSize;
/// Possible valies: \ref ADL_DL_THERMAL_DOMAIN_OTHER or \ref ADL_DL_THERMAL_DOMAIN_GPU.
int iThermalDomain;
/// GPU 0, 1, etc.
int iDomainIndex;
/// Possible valies: \ref ADL_DL_THERMAL_FLAG_INTERRUPT or \ref ADL_DL_THERMAL_FLAG_FANCONTROL
int iFlags;
} ADLThermalControllerInfo;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about thermal controller temperature.
///
/// This structure is used to store information about thermal controller temperature.
/// This structure is used by the ADL_PM_Temperature_Get() function.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLTemperature
{
/// Must be set to the size of the structure
int iSize;
/// Temperature in millidegrees Celsius.
int iTemperature;
} ADLTemperature;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about thermal controller fan speed.
///
/// This structure is used to store information about thermal controller fan speed.
/// This structure is used by the ADL_PM_FanSpeedInfo_Get() function.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLFanSpeedInfo
{
/// Must be set to the size of the structure
int iSize;
/// \ref define_fanctrl
int iFlags;
/// Minimum possible fan speed value in percents.
int iMinPercent;
/// Maximum possible fan speed value in percents.
int iMaxPercent;
/// Minimum possible fan speed value in RPM.
int iMinRPM;
/// Maximum possible fan speed value in RPM.
int iMaxRPM;
} ADLFanSpeedInfo;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about fan speed reported by thermal controller.
///
/// This structure is used to store information about fan speed reported by thermal controller.
/// This structure is used by the ADL_Overdrive5_FanSpeed_Get() and ADL_Overdrive5_FanSpeed_Set() functions.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLFanSpeedValue
{
/// Must be set to the size of the structure
int iSize;
/// Possible valies: \ref ADL_DL_FANCTRL_SPEED_TYPE_PERCENT or \ref ADL_DL_FANCTRL_SPEED_TYPE_RPM
int iSpeedType;
/// Fan speed value
int iFanSpeed;
/// The only flag for now is: \ref ADL_DL_FANCTRL_FLAG_USER_DEFINED_SPEED
int iFlags;
} ADLFanSpeedValue;
typedef struct ADLDisplayID
{
int iDisplayLogicalIndex;
int iDisplayPhysicalIndex;
int iDisplayLogicalAdapterIndex;
int iDisplayPhysicalAdapterIndex;
} ADLDisplayID, *LPADLDisplayID;
typedef struct ADLDisplayInfo
{
ADLDisplayID displayID;
int iDisplayControllerIndex;
char strDisplayName[ADL_MAX_PATH];
char strDisplayManufacturerName[ADL_MAX_PATH];
int iDisplayType;
int iDisplayOutputType;
int iDisplayConnector;
int iDisplayInfoMask;
int iDisplayInfoValue;
} ADLDisplayInfo, *LPADLDisplayInfo;
typedef struct ADLBiosInfo
{
char strPartNumber[ADL_MAX_PATH];
char strVersion[ADL_MAX_PATH];
char strDate[ADL_MAX_PATH];
} ADLBiosInfo, *LPADLBiosInfo;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about current power management related activity.
///
/// This structure is used to store information about current power management related activity.
/// This structure (Overdrive 5 interfaces) is used by the ADL_PM_CurrentActivity_Get() function.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLPMActivity
{
int iSize;
int iEngineClock;
int iMemoryClock;
int iVddc;
int iActivityPercent;
int iCurrentPerformanceLevel;
int iCurrentBusSpeed;
int iCurrentBusLanes;
int iMaximumBusLanes;
int iReserved;
/// Must be set to the size of the structure
int iSize;
/// Current engine clock.
int iEngineClock;
/// Current memory clock.
int iMemoryClock;
/// Current core voltage.
int iVddc;
/// GPU utilization.
int iActivityPercent;
/// Performance level index.
int iCurrentPerformanceLevel;
/// Current PCIE bus speed.
int iCurrentBusSpeed;
/// Number of PCIE bus lanes.
int iCurrentBusLanes;
/// Maximum number of PCIE bus lanes.
int iMaximumBusLanes;
/// Reserved for future purposes.
int iReserved;
} ADLPMActivity;
////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing the range of Overdrive parameter.
///
/// This structure is used to store information about the range of Overdrive parameter.
/// This structure is used by ADLODParameters.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLODParameterRange
{
/// Minimum parameter value.
int iMin;
/// Maximum parameter value.
int iMax;
/// Parameter step value.
int iStep;
} ADLODParameterRange;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive parameters.
///
/// This structure is used to store information about Overdrive parameters.
/// This structure is used by the ADL_Overdrive5_ODParameters_Get() function.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLODParameters
{
/// Must be set to the size of the structure
int iSize;
/// Number of standard performance states.
int iNumberOfPerformanceLevels;
/// Indicates whether the GPU is capable to measure its activity.
int iActivityReportingSupported;
/// Indicates whether the GPU supports discrete performance levels or performance range.
int iDiscretePerformanceLevels;
/// Reserved for future use.
int iReserved;
/// Engine clock range.
ADLODParameterRange sEngineClock;
/// Memory clock range.
ADLODParameterRange sMemoryClock;
/// Core voltage range.
ADLODParameterRange sVddc;
} ADLODParameters;
typedef struct ADLODPerformanceLevel
{
int iEngineClock;
int iMemoryClock;
int iVddc;
} ADLODPerformanceLevel;
/*
* Attention: we had to change this struct due to an out-of-bound problem mentioned here:
* https://github.com/hashcat/hashcat/issues/244
* the change: ADLODPerformanceLevel aLevels [1] -> ADLODPerformanceLevel aLevels [2]
*/
typedef struct ADLODPerformanceLevels
{
int iSize;
int iReserved;
ADLODPerformanceLevel aLevels [2];
} ADLODPerformanceLevels;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive 6 fan speed information
///
/// This structure is used to store information about Overdrive 6 fan speed information
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLOD6FanSpeedInfo
{
int iSpeedType;
int iFanSpeedPercent;
int iFanSpeedRPM;
int iExtValue;
int iExtMask;
/// Contains a bitmap of the valid fan speed type flags. Possible values: \ref ADL_OD6_FANSPEED_TYPE_PERCENT, \ref ADL_OD6_FANSPEED_TYPE_RPM, \ref ADL_OD6_FANSPEED_USER_DEFINED
int iSpeedType;
/// Contains current fan speed in percent (if valid flag exists in iSpeedType)
int iFanSpeedPercent;
/// Contains current fan speed in RPM (if valid flag exists in iSpeedType)
int iFanSpeedRPM;
/// Value for future extension
int iExtValue;
/// Mask for future extension
int iExtMask;
} ADLOD6FanSpeedInfo;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive 6 fan speed value
///
/// This structure is used to store information about Overdrive 6 fan speed value
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLOD6FanSpeedValue
{
int iSpeedType;
int iFanSpeed;
int iExtValue;
int iExtMask;
/// Indicates the units of the fan speed. Possible values: \ref ADL_OD6_FANSPEED_TYPE_PERCENT, \ref ADL_OD6_FANSPEED_TYPE_RPM
int iSpeedType;
/// Fan speed value (units as indicated above)
int iFanSpeed;
/// Value for future extension
int iExtValue;
/// Mask for future extension
int iExtMask;
} ADLOD6FanSpeedValue;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about current Overdrive 6 performance status.
///
/// This structure is used to store information about current Overdrive 6 performance status.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLOD6CurrentStatus
{
int iEngineClock;
int iMemoryClock;
int iActivityPercent;
int iCurrentPerformanceLevel;
int iCurrentBusSpeed;
int iCurrentBusLanes;
int iMaximumBusLanes;
int iExtValue;
int iExtMask;
/// Current engine clock in 10 KHz.
int iEngineClock;
/// Current memory clock in 10 KHz.
int iMemoryClock;
/// Current GPU activity in percent. This
/// indicates how "busy" the GPU is.
int iActivityPercent;
/// Not used. Reserved for future use.
int iCurrentPerformanceLevel;
/// Current PCI-E bus speed
int iCurrentBusSpeed;
/// Current PCI-E bus # of lanes
int iCurrentBusLanes;
/// Maximum possible PCI-E bus # of lanes
int iMaximumBusLanes;
/// Value for future extension
int iExtValue;
/// Mask for future extension
int iExtMask;
} ADLOD6CurrentStatus;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive 6 clock range
///
/// This structure is used to store information about Overdrive 6 clock range
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLOD6ParameterRange
{
int iMin;
int iMax;
int iStep;
/// The starting value of the clock range
int iMin;
/// The ending value of the clock range
int iMax;
/// The minimum increment between clock values
int iStep;
} ADLOD6ParameterRange;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive 6 capabilities
///
/// This structure is used to store information about Overdrive 6 capabilities
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLOD6Capabilities
{
int iCapabilities;
int iSupportedStates;
int iNumberOfPerformanceLevels;
ADLOD6ParameterRange sEngineClockRange;
ADLOD6ParameterRange sMemoryClockRange;
int iExtValue;
int iExtMask;
/// Contains a bitmap of the OD6 capability flags. Possible values: \ref ADL_OD6_CAPABILITY_SCLK_CUSTOMIZATION,
/// \ref ADL_OD6_CAPABILITY_MCLK_CUSTOMIZATION, \ref ADL_OD6_CAPABILITY_GPU_ACTIVITY_MONITOR
int iCapabilities;
/// Contains a bitmap indicating the power states
/// supported by OD6. Currently only the performance state
/// is supported. Possible Values: \ref ADL_OD6_SUPPORTEDSTATE_PERFORMANCE
int iSupportedStates;
/// Number of levels. OD6 will always use 2 levels, which describe
/// the minimum to maximum clock ranges.
/// The 1st level indicates the minimum clocks, and the 2nd level
/// indicates the maximum clocks.
int iNumberOfPerformanceLevels;
/// Contains the hard limits of the sclk range. Overdrive
/// clocks cannot be set outside this range.
ADLOD6ParameterRange sEngineClockRange;
/// Contains the hard limits of the mclk range. Overdrive
/// clocks cannot be set outside this range.
ADLOD6ParameterRange sMemoryClockRange;
/// Value for future extension
int iExtValue;
/// Mask for future extension
int iExtMask;
} ADLOD6Capabilities;
typedef struct ADLOD6PerformanceLevel
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive level.
///
/// This structure is used to store information about Overdrive level.
/// This structure is used by ADLODPerformanceLevels.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLODPerformanceLevel
{
/// Engine clock.
int iEngineClock;
/// Memory clock.
int iMemoryClock;
} ADLOD6PerformanceLevel;
/// Core voltage.
int iVddc;
} ADLODPerformanceLevel;
/*
* Attention: we had to change this struct due to an out-of-bound problem mentioned here:
* https://github.com/hashcat/hashcat/issues/244
* the change: ADLOD6PerformanceLevel aLevels [1] -> ADLOD6PerformanceLevel aLevels [2]
*/
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive 6 clock values.
///
/// This structure is used to store information about Overdrive 6 clock values.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLOD6PerformanceLevel
{
/// Engine (core) clock.
int iEngineClock;
/// Memory clock.
int iMemoryClock;
} ADLOD6PerformanceLevel;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive 6 clocks.
///
/// This structure is used to store information about Overdrive 6 clocks. This is a
/// variable-sized structure. iNumberOfPerformanceLevels indicate how many elements
/// are contained in the aLevels array.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLOD6StateInfo
{
int iNumberOfPerformanceLevels;
int iExtValue;
int iExtMask;
ADLOD6PerformanceLevel aLevels [2];
/// Number of levels. OD6 uses clock ranges instead of discrete performance levels.
/// iNumberOfPerformanceLevels is always 2. The 1st level indicates the minimum clocks
/// in the range. The 2nd level indicates the maximum clocks in the range.
int iNumberOfPerformanceLevels;
/// Value for future extension
int iExtValue;
/// Mask for future extension
int iExtMask;
/// Variable-sized array of levels.
/// The number of elements in the array is specified by iNumberofPerformanceLevels.
ADLOD6PerformanceLevel aLevels [1];
} ADLOD6StateInfo;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Overdrive performance levels.
///
/// This structure is used to store information about Overdrive performance levels.
/// This structure is used by the ADL_Overdrive5_ODPerformanceLevels_Get() and ADL_Overdrive5_ODPerformanceLevels_Set() functions.
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLODPerformanceLevels
{
/// Must be set to sizeof( \ref ADLODPerformanceLevels ) + sizeof( \ref ADLODPerformanceLevel ) * (ADLODParameters.iNumberOfPerformanceLevels - 1)
int iSize;
int iReserved;
/// Array of performance state descriptors. Must have ADLODParameters.iNumberOfPerformanceLevels elements.
ADLODPerformanceLevel aLevels [1];
} ADLODPerformanceLevels;
/////////////////////////////////////////////////////////////////////////////////////////////
///\brief Structure containing information about Performance Metrics data
///
/// This structure is used to store information about Performance Metrics data output
/// \nosubgrouping
////////////////////////////////////////////////////////////////////////////////////////////
typedef struct ADLSingleSensorData
{
int supported;
int value;
} ADLSingleSensorData;
typedef struct ADLPMLogDataOutput
{
int size;
ADLSingleSensorData sensors[ADL_PMLOG_MAX_SENSORS];
}ADLPMLogDataOutput;
/// \brief Handle to ADL client context.
///
/// ADL clients obtain context handle from initial call to \ref ADL2_Main_Control_Create.
/// Clients have to pass the handle to each subsequent ADL call and finally destroy
/// the context with call to \ref ADL2_Main_Control_Destroy
/// \nosubgrouping
typedef void *ADL_CONTEXT_HANDLE;
#if defined (__MSC_VER)
#define ADL_API_CALL __cdecl
#elif defined (_WIN32) || defined (__WIN32__)
@ -251,62 +592,51 @@ typedef void* (ADL_API_CALL *ADL_MAIN_MALLOC_CALLBACK )( int );
typedef int HM_ADAPTER_ADL;
typedef struct struct_ADLOD6MemClockState
{
ADLOD6StateInfo state;
ADLOD6PerformanceLevel level;
} ADLOD6MemClockState;
typedef int (ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY) (void);
typedef int (ADL_API_CALL *ADL_MAIN_CONTROL_CREATE) (ADL_MAIN_MALLOC_CALLBACK, int);
typedef int (ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET) (int *);
typedef int (ADL_API_CALL *ADL_ADAPTER_ADAPTERINFO_GET) (LPAdapterInfo, int);
typedef int (ADL_API_CALL *ADL_DISPLAY_DISPLAYINFO_GET) (int, int *, ADLDisplayInfo **, int);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_TEMPERATURE_GET) (int, int, ADLTemperature *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_TEMPERATURE_GET) (int, int *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_CURRENTACTIVITY_GET) (int, ADLPMActivity *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_THERMALDEVICES_ENUM) (int, int, ADLThermalControllerInfo *);
typedef int (ADL_API_CALL *ADL_ADAPTER_ID_GET) (int, int *);
typedef int (ADL_API_CALL *ADL_ADAPTER_VIDEOBIOSINFO_GET) (int, ADLBiosInfo *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_FANSPEEDINFO_GET) (int, int, ADLFanSpeedInfo *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_FANSPEED_GET) (int, int, ADLFanSpeedValue *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_FANSPEED_GET) (int, ADLOD6FanSpeedInfo *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_ODPARAMETERS_GET) (int, ADLODParameters *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_ODPERFORMANCELEVELS_GET) (int, int, ADLODPerformanceLevels *);
typedef int (ADL_API_CALL *ADL_ADAPTER_ACTIVE_GET) (int, int *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE_CAPS) (int, int *, int *, int *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_CURRENTSTATUS_GET) (int, ADLOD6CurrentStatus *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_STATEINFO_GET) (int, int, ADLOD6MemClockState *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_CAPABILITIES_GET) (int, ADLOD6Capabilities *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_TARGETTEMPERATUREDATA_GET) (int, int *, int *);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_TARGETTEMPERATURERANGEINFO_GET) (int, ADLOD6ParameterRange *);
typedef int (ADL_API_CALL *ADL_ADAPTER_ACTIVE_GET ) ( int, int* );
typedef int (ADL_API_CALL *ADL_ADAPTER_ADAPTERINFO_GET ) ( LPAdapterInfo, int );
typedef int (ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET ) ( int* );
typedef int (ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )(ADL_MAIN_MALLOC_CALLBACK, int );
typedef int (ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )();
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_CURRENTACTIVITY_GET ) (int iAdapterIndex, ADLPMActivity *lpActivity);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_FANSPEEDINFO_GET ) (int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedInfo *lpFanSpeedInfo);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_FANSPEED_GET ) (int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_ODPARAMETERS_GET ) (int iAdapterIndex, ADLODParameters *lpOdParameters);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_ODPERFORMANCELEVELS_GET ) (int iAdapterIndex, int iDefault, ADLODPerformanceLevels *lpOdPerformanceLevels);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_TEMPERATURE_GET ) (int iAdapterIndex, int iThermalControllerIndex, ADLTemperature *lpTemperature);
typedef int (ADL_API_CALL *ADL_OVERDRIVE5_THERMALDEVICES_ENUM ) (int iAdapterIndex, int iThermalControllerIndex, ADLThermalControllerInfo *lpThermalControllerInfo);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_CAPABILITIES_GET ) (int iAdapterIndex, ADLOD6Capabilities *lpODCapabilities);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_CURRENTSTATUS_GET )(int iAdapterIndex, ADLOD6CurrentStatus *lpCurrentStatus);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_FANSPEED_GET )(int iAdapterIndex, ADLOD6FanSpeedInfo *lpFanSpeedInfo);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_STATEINFO_GET )(int iAdapterIndex, int iStateType, ADLOD6StateInfo *lpStateInfo);
typedef int (ADL_API_CALL *ADL_OVERDRIVE6_TEMPERATURE_GET )(int iAdapterIndex, int *lpTemperature);
typedef int (ADL_API_CALL *ADL_OVERDRIVE_CAPS ) (int iAdapterIndex, int *iSupported, int *iEnabled, int *iVersion);
typedef int (ADL_API_CALL *ADL2_OVERDRIVE_CAPS) (ADL_CONTEXT_HANDLE context, int iAdapterIndex, int * iSupported, int * iEnabled, int * iVersion);
typedef int (ADL_API_CALL *ADL2_NEW_QUERYPMLOGDATA_GET) (ADL_CONTEXT_HANDLE, int, ADLPMLogDataOutput*);
typedef struct hm_adl_lib
{
hc_dynlib_t lib;
ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy;
ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create;
ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
ADL_ADAPTER_ACTIVE_GET ADL_Adapter_Active_Get;
ADL_ADAPTER_ADAPTERINFO_GET ADL_Adapter_AdapterInfo_Get;
ADL_DISPLAY_DISPLAYINFO_GET ADL_Display_DisplayInfo_Get;
ADL_ADAPTER_ID_GET ADL_Adapter_ID_Get;
ADL_ADAPTER_VIDEOBIOSINFO_GET ADL_Adapter_VideoBiosInfo_Get;
ADL_OVERDRIVE5_THERMALDEVICES_ENUM ADL_Overdrive5_ThermalDevices_Enum;
ADL_OVERDRIVE5_TEMPERATURE_GET ADL_Overdrive5_Temperature_Get;
ADL_OVERDRIVE6_TEMPERATURE_GET ADL_Overdrive6_Temperature_Get;
ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create;
ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy;
ADL_OVERDRIVE5_CURRENTACTIVITY_GET ADL_Overdrive5_CurrentActivity_Get;
ADL_OVERDRIVE5_FANSPEEDINFO_GET ADL_Overdrive5_FanSpeedInfo_Get;
ADL_OVERDRIVE5_FANSPEED_GET ADL_Overdrive5_FanSpeed_Get;
ADL_OVERDRIVE6_FANSPEED_GET ADL_Overdrive6_FanSpeed_Get;
ADL_ADAPTER_ACTIVE_GET ADL_Adapter_Active_Get;
ADL_OVERDRIVE_CAPS ADL_Overdrive_Caps;
ADL_OVERDRIVE5_ODPARAMETERS_GET ADL_Overdrive5_ODParameters_Get;
ADL_OVERDRIVE5_ODPERFORMANCELEVELS_GET ADL_Overdrive5_ODPerformanceLevels_Get;
ADL_OVERDRIVE5_TEMPERATURE_GET ADL_Overdrive5_Temperature_Get;
ADL_OVERDRIVE5_THERMALDEVICES_ENUM ADL_Overdrive5_ThermalDevices_Enum;
ADL_OVERDRIVE6_CAPABILITIES_GET ADL_Overdrive6_Capabilities_Get;
ADL_OVERDRIVE6_STATEINFO_GET ADL_Overdrive6_StateInfo_Get;
ADL_OVERDRIVE6_CURRENTSTATUS_GET ADL_Overdrive6_CurrentStatus_Get;
ADL_OVERDRIVE6_TARGETTEMPERATUREDATA_GET ADL_Overdrive6_TargetTemperatureData_Get;
ADL_OVERDRIVE6_TARGETTEMPERATURERANGEINFO_GET ADL_Overdrive6_TargetTemperatureRangeInfo_Get;
ADL_OVERDRIVE6_FANSPEED_GET ADL_Overdrive6_FanSpeed_Get;
ADL_OVERDRIVE6_STATEINFO_GET ADL_Overdrive6_StateInfo_Get;
ADL_OVERDRIVE6_TEMPERATURE_GET ADL_Overdrive6_Temperature_Get;
ADL_OVERDRIVE_CAPS ADL_Overdrive_Caps;
ADL2_OVERDRIVE_CAPS ADL2_Overdrive_Caps;
ADL2_NEW_QUERYPMLOGDATA_GET ADL2_New_QueryPMLogData_Get;
} hm_adl_lib_t;
@ -326,6 +656,8 @@ int hm_ADL_Overdrive_CurrentActivity_Get (void *hashcat_ctx, int iAdapterIndex,
int hm_ADL_Overdrive5_FanSpeed_Get (void *hashcat_ctx, int iAdapterIndex, int iThermalControllerIndex, ADLFanSpeedValue *lpFanSpeedValue);
int hm_ADL_Overdrive6_FanSpeed_Get (void *hashcat_ctx, int iAdapterIndex, ADLOD6FanSpeedInfo *lpFanSpeedInfo);
int hm_ADL_Overdrive_Caps (void *hashcat_ctx, int iAdapterIndex, int *od_supported, int *od_enabled, int *od_version);
int hm_ADL_Overdrive6_TargetTemperatureData_Get (void *hashcat_ctx, int iAdapterIndex, int *cur_temp, int *default_temp);
int hm_ADL2_Overdrive_Caps (void *hashcat_ctx, int iAdapterIndex, int *od_supported, int *od_enabled, int *od_version);
int hm_ADL2_New_QueryPMLogData_Get (void *hashcat_ctx, int iAdapterIndex, ADLPMLogDataOutput *lpDataOutput);
#endif // _EXT_ADL_H

@ -46,6 +46,7 @@ typedef cl_context (CL_API_CALL *OCL_CLCREATECONTEXT) (const cl_
typedef cl_kernel (CL_API_CALL *OCL_CLCREATEKERNEL) (cl_program, const char *, cl_int *);
typedef cl_program (CL_API_CALL *OCL_CLCREATEPROGRAMWITHBINARY) (cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
typedef cl_program (CL_API_CALL *OCL_CLCREATEPROGRAMWITHSOURCE) (cl_context, cl_uint, const char **, const size_t *, cl_int *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUEFILLBUFFER) (cl_command_queue, cl_mem, const void *, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUECOPYBUFFER) (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *);
typedef void * (CL_API_CALL *OCL_CLENQUEUEMAPBUFFER) (cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUENDRANGEKERNEL) (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
@ -87,6 +88,7 @@ typedef struct hc_opencl_lib
OCL_CLCREATEPROGRAMWITHBINARY clCreateProgramWithBinary;
OCL_CLCREATEPROGRAMWITHSOURCE clCreateProgramWithSource;
OCL_CLENQUEUECOPYBUFFER clEnqueueCopyBuffer;
OCL_CLENQUEUEFILLBUFFER clEnqueueFillBuffer;
OCL_CLENQUEUEMAPBUFFER clEnqueueMapBuffer;
OCL_CLENQUEUENDRANGEKERNEL clEnqueueNDRangeKernel;
OCL_CLENQUEUEREADBUFFER clEnqueueReadBuffer;

@ -1028,14 +1028,14 @@ typedef CUresult (CUDA_API_CALL *CUDA_CUINIT) (unsigned int);
typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL) (CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC) (CUdeviceptr *, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST) (void **, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD) (CUdeviceptr, CUdeviceptr, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH) (void *, CUdeviceptr, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD) (CUdeviceptr, const void *, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTODASYNC) (CUdeviceptr, CUdeviceptr, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOHASYNC) (void *, CUdeviceptr, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTODASYNC) (CUdeviceptr, const void *, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE) (CUdeviceptr);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST) (void *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO) (size_t *, size_t *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32) (CUdeviceptr, unsigned int, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8) (CUdeviceptr, unsigned char, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32ASYNC) (CUdeviceptr, unsigned int, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8ASYNC) (CUdeviceptr, unsigned char, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETFUNCTION) (CUfunction *, CUmodule, const char *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETGLOBAL) (CUdeviceptr *, size_t *, CUmodule, const char *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMODULELOAD) (CUmodule *, const char *);
@ -1090,14 +1090,14 @@ typedef struct hc_cuda_lib
CUDA_CULAUNCHKERNEL cuLaunchKernel;
CUDA_CUMEMALLOC cuMemAlloc;
CUDA_CUMEMALLOCHOST cuMemAllocHost;
CUDA_CUMEMCPYDTOD cuMemcpyDtoD;
CUDA_CUMEMCPYDTOH cuMemcpyDtoH;
CUDA_CUMEMCPYHTOD cuMemcpyHtoD;
CUDA_CUMEMCPYDTODASYNC cuMemcpyDtoDAsync;
CUDA_CUMEMCPYDTOHASYNC cuMemcpyDtoHAsync;
CUDA_CUMEMCPYHTODASYNC cuMemcpyHtoDAsync;
CUDA_CUMEMFREE cuMemFree;
CUDA_CUMEMFREEHOST cuMemFreeHost;
CUDA_CUMEMGETINFO cuMemGetInfo;
CUDA_CUMEMSETD32 cuMemsetD32;
CUDA_CUMEMSETD8 cuMemsetD8;
CUDA_CUMEMSETD32ASYNC cuMemsetD32Async;
CUDA_CUMEMSETD8ASYNC cuMemsetD8Async;
CUDA_CUMODULEGETFUNCTION cuModuleGetFunction;
CUDA_CUMODULEGETGLOBAL cuModuleGetGlobal;
CUDA_CUMODULELOAD cuModuleLoad;

File diff suppressed because it is too large Load Diff

@ -6,41 +6,26 @@
#ifndef _EXT_HIPRTC_H
#define _EXT_HIPRTC_H
/**
* from hip_runtime.h (/opt/rocm/hip/include/hip/amd_detail/hiprtc.h)
*/
// start: amd_detail/hiprtc.h
/**
* \ingroup error
* \brief The enumerated type hiprtcResult defines API call result codes.
* HIPRTC API functions return hiprtcResult to indicate the call
* result.
*/
typedef enum {
HIPRTC_SUCCESS = 0,
HIPRTC_ERROR_OUT_OF_MEMORY = 1,
HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
HIPRTC_ERROR_INVALID_INPUT = 3,
HIPRTC_ERROR_INVALID_PROGRAM = 4,
HIPRTC_ERROR_INVALID_OPTION = 5,
HIPRTC_ERROR_COMPILATION = 6,
HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
HIPRTC_ERROR_INTERNAL_ERROR = 11
typedef enum hiprtcResult {
HIPRTC_SUCCESS = 0,
HIPRTC_ERROR_OUT_OF_MEMORY = 1,
HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
HIPRTC_ERROR_INVALID_INPUT = 3,
HIPRTC_ERROR_INVALID_PROGRAM = 4,
HIPRTC_ERROR_INVALID_OPTION = 5,
HIPRTC_ERROR_COMPILATION = 6,
HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
HIPRTC_ERROR_INTERNAL_ERROR = 11
} hiprtcResult;
/**
* \ingroup compilation
* \brief hiprtcProgram is the unit of compilation, and an opaque handle for
* a program.
*
* To compile a CUDA program string, an instance of hiprtcProgram must be
* created first with ::hiprtcCreateProgram, then compiled with
* ::hiprtcCompileProgram.
*/
typedef struct _hiprtcProgram *hiprtcProgram;
typedef struct _hiprtcProgram* hiprtcProgram;
// stop: amd_detail/hiprtc.h
#ifdef _WIN32
#define HIPRTCAPI __stdcall
@ -54,13 +39,12 @@ typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCADDNAMEEXPRESSION) (hiprtc
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCCOMPILEPROGRAM) (hiprtcProgram, int, const char * const *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCCREATEPROGRAM) (hiprtcProgram *, const char *, const char *, int, const char * const *, const char * const *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCDESTROYPROGRAM) (hiprtcProgram *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETCODE) (hiprtcProgram, char *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETCODESIZE) (hiprtcProgram, size_t *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETLOWEREDNAME) (hiprtcProgram, const char * const, const char **);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPTX) (hiprtcProgram, char *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPTXSIZE) (hiprtcProgram, size_t *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPROGRAMLOG) (hiprtcProgram, char *);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCGETPROGRAMLOGSIZE) (hiprtcProgram, size_t *);
typedef const char * (HIPRTC_API_CALL *HIPRTC_HIPRTCGETERRORSTRING) (hiprtcResult);
typedef hiprtcResult (HIPRTC_API_CALL *HIPRTC_HIPRTCVERSION) (int *, int *);
typedef const char * (HIPRTC_API_CALL *HIPRTC_HIPRTCGETERRORSTRING) (hiprtcResult);
typedef struct hc_hiprtc_lib
{
@ -70,13 +54,12 @@ typedef struct hc_hiprtc_lib
HIPRTC_HIPRTCCOMPILEPROGRAM hiprtcCompileProgram;
HIPRTC_HIPRTCCREATEPROGRAM hiprtcCreateProgram;
HIPRTC_HIPRTCDESTROYPROGRAM hiprtcDestroyProgram;
HIPRTC_HIPRTCGETCODE hiprtcGetCode;
HIPRTC_HIPRTCGETCODESIZE hiprtcGetCodeSize;
HIPRTC_HIPRTCGETLOWEREDNAME hiprtcGetLoweredName;
HIPRTC_HIPRTCGETPTX hiprtcGetCode;
HIPRTC_HIPRTCGETPTXSIZE hiprtcGetCodeSize;
HIPRTC_HIPRTCGETPROGRAMLOG hiprtcGetProgramLog;
HIPRTC_HIPRTCGETPROGRAMLOGSIZE hiprtcGetProgramLogSize;
HIPRTC_HIPRTCGETERRORSTRING hiprtcGetErrorString;
HIPRTC_HIPRTCVERSION hiprtcVersion;
} hc_hiprtc_lib_t;

@ -15,8 +15,8 @@
int _wopen (const char *path, int oflag, ...);
#endif
bool hc_fopen (HCFILE *fp, const char *path, char *mode);
bool hc_fopen_raw (HCFILE *fp, const char *path, char *mode);
bool hc_fopen (HCFILE *fp, const char *path, const char *mode);
bool hc_fopen_raw (HCFILE *fp, const char *path, const char *mode);
int hc_fscanf (HCFILE *fp, const char *format, void *ptr);
int hc_fprintf (HCFILE *fp, const char *format, ...);
int hc_vfprintf (HCFILE *fp, const char *format, va_list ap);

@ -15,7 +15,7 @@ int hash_encode (const hashconfig_t *hashconfig, const hashes_t *hashes, const m
int save_hash (hashcat_ctx_t *hashcat_ctx);
void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, plain_t *plain);
int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, plain_t *plain);
//int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 salt_pos);
int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param);

@ -13,7 +13,7 @@
#include <limits.h>
#include <inttypes.h>
static const int MODULE_INTERFACE_VERSION_MINIMUM = 520;
static const int MODULE_INTERFACE_VERSION_MINIMUM = 630;
static const int MODULE_HASH_MODES_MAXIMUM = 100000;

@ -11,6 +11,7 @@ void *module_benchmark_esalt (MAYBE_UNUSED const hashconfig_t *ha
void *module_benchmark_hook_salt (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
const char *module_benchmark_mask (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
salt_t *module_benchmark_salt (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
const char *module_deprecated_notice (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
bool module_dictstat_disable (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u32 module_dgst_pos0 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u32 module_dgst_pos1 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
@ -18,6 +19,7 @@ u32 module_dgst_pos2 (MAYBE_UNUSED const hashconfig_t *ha
u32 module_dgst_pos3 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u32 module_dgst_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u32 module_forced_outfile_format (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
u32 module_hash_category (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);
const char *module_hash_name (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra);

@ -11,9 +11,13 @@
#define TUNING_DB_FILE "hashcat.hctune"
int sort_by_tuning_db_alias (const void *v1, const void *v2);
int sort_by_tuning_db_entry (const void *v1, const void *v2);
int tuning_db_init (hashcat_ctx_t *hashcat_ctx);
void tuning_db_destroy (hashcat_ctx_t *hashcat_ctx);
bool tuning_db_process_line (hashcat_ctx_t *hashcat_ctx, const char *line_buf, const int line_num);
tuning_db_entry_t *tuning_db_search (hashcat_ctx_t *hashcat_ctx, const char *device_name, const cl_device_type device_type, int attack_mode, const int hash_mode);
#endif // _TUNINGDB_H

@ -295,6 +295,7 @@ typedef enum rule_functions
RULE_OP_MANGLE_UREST_LFIRST = 'C',
RULE_OP_MANGLE_TREST = 't',
RULE_OP_MANGLE_TOGGLE_AT = 'T',
RULE_OP_MANGLE_TOGGLE_AT_SEP = '3',
RULE_OP_MANGLE_REVERSE = 'r',
RULE_OP_MANGLE_DUPEWORD = 'd',
RULE_OP_MANGLE_DUPEWORD_TIMES = 'p',
@ -463,6 +464,8 @@ typedef enum dgst_size
DGST_SIZE_4_32 = (32 * sizeof (u32)), // 128 !!!
DGST_SIZE_4_64 = (64 * sizeof (u32)), // 256
DGST_SIZE_8_2 = (2 * sizeof (u64)), // 16 !!!
DGST_SIZE_8_4 = (4 * sizeof (u64)), // 32 !!!
DGST_SIZE_8_6 = (6 * sizeof (u64)), // 48 !!!
DGST_SIZE_8_8 = (8 * sizeof (u64)), // 64 !!!
DGST_SIZE_8_16 = (16 * sizeof (u64)), // 128 !!!
DGST_SIZE_8_25 = (25 * sizeof (u64)) // 200
@ -612,6 +615,7 @@ typedef enum user_options_defaults
BRAIN_SESSION = 0,
#endif
DEBUG_MODE = 0,
DEPRECATED_CHECK_DISABLE = false,
FORCE = false,
HWMON_DISABLE = false,
#if defined (__APPLE__)
@ -651,6 +655,7 @@ typedef enum user_options_defaults
BACKEND_INFO = false,
BACKEND_VECTOR_WIDTH = 0,
OPTIMIZED_KERNEL_ENABLE = false,
MULTIPLY_ACCEL_DISABLE = false,
OUTFILE_AUTOHEX = true,
OUTFILE_CHECK_TIMER = 5,
OUTFILE_FORMAT = 3,
@ -670,7 +675,6 @@ typedef enum user_options_defaults
SCRYPT_TMTO = 0,
SEGMENT_SIZE = 33554432,
SELF_TEST_DISABLE = false,
SEPARATOR = ':',
SHOW = false,
SKIP = 0,
SLOW_CANDIDATES = false,
@ -723,88 +727,90 @@ typedef enum user_options_map
IDX_CUSTOM_CHARSET_4 = '4',
IDX_DEBUG_FILE = 0xff11,
IDX_DEBUG_MODE = 0xff12,
IDX_ENCODING_FROM = 0xff13,
IDX_ENCODING_TO = 0xff14,
IDX_HASH_INFO = 0xff15,
IDX_FORCE = 0xff16,
IDX_HWMON_DISABLE = 0xff17,
IDX_HWMON_TEMP_ABORT = 0xff18,
IDX_DEPRECATED_CHECK_DISABLE = 0xff13,
IDX_ENCODING_FROM = 0xff14,
IDX_ENCODING_TO = 0xff15,
IDX_HASH_INFO = 0xff16,
IDX_FORCE = 0xff17,
IDX_HWMON_DISABLE = 0xff18,
IDX_HWMON_TEMP_ABORT = 0xff19,
IDX_HASH_MODE = 'm',
IDX_HCCAPX_MESSAGE_PAIR = 0xff19,
IDX_HCCAPX_MESSAGE_PAIR = 0xff1a,
IDX_HELP = 'h',
IDX_HEX_CHARSET = 0xff1a,
IDX_HEX_SALT = 0xff1b,
IDX_HEX_WORDLIST = 0xff1c,
IDX_HOOK_THREADS = 0xff1d,
IDX_IDENTIFY = 0xff1e,
IDX_HEX_CHARSET = 0xff1b,
IDX_HEX_SALT = 0xff1c,
IDX_HEX_WORDLIST = 0xff1d,
IDX_HOOK_THREADS = 0xff1e,
IDX_IDENTIFY = 0xff1f,
IDX_INCREMENT = 'i',
IDX_INCREMENT_MAX = 0xff1f,
IDX_INCREMENT_MIN = 0xff20,
IDX_INDUCTION_DIR = 0xff21,
IDX_KEEP_GUESSING = 0xff22,
IDX_INCREMENT_MAX = 0xff20,
IDX_INCREMENT_MIN = 0xff21,
IDX_INDUCTION_DIR = 0xff22,
IDX_KEEP_GUESSING = 0xff23,
IDX_KERNEL_ACCEL = 'n',
IDX_KERNEL_LOOPS = 'u',
IDX_KERNEL_THREADS = 'T',
IDX_KEYBOARD_LAYOUT_MAPPING = 0xff23,
IDX_KEYSPACE = 0xff24,
IDX_LEFT = 0xff25,
IDX_KEYBOARD_LAYOUT_MAPPING = 0xff24,
IDX_KEYSPACE = 0xff25,
IDX_LEFT = 0xff26,
IDX_LIMIT = 'l',
IDX_LOGFILE_DISABLE = 0xff26,
IDX_LOOPBACK = 0xff27,
IDX_MACHINE_READABLE = 0xff28,
IDX_MARKOV_CLASSIC = 0xff29,
IDX_MARKOV_DISABLE = 0xff2a,
IDX_MARKOV_HCSTAT2 = 0xff2b,
IDX_MARKOV_INVERSE = 0xff2c,
IDX_LOGFILE_DISABLE = 0xff27,
IDX_LOOPBACK = 0xff28,
IDX_MACHINE_READABLE = 0xff29,
IDX_MARKOV_CLASSIC = 0xff2a,
IDX_MARKOV_DISABLE = 0xff2b,
IDX_MARKOV_HCSTAT2 = 0xff2c,
IDX_MARKOV_INVERSE = 0xff2d,
IDX_MARKOV_THRESHOLD = 't',
IDX_NONCE_ERROR_CORRECTIONS = 0xff2d,
IDX_NONCE_ERROR_CORRECTIONS = 0xff2e,
IDX_OPENCL_DEVICE_TYPES = 'D',
IDX_OPTIMIZED_KERNEL_ENABLE = 'O',
IDX_OUTFILE_AUTOHEX_DISABLE = 0xff2e,
IDX_OUTFILE_CHECK_DIR = 0xff2f,
IDX_OUTFILE_CHECK_TIMER = 0xff30,
IDX_OUTFILE_FORMAT = 0xff31,
IDX_MULTIPLY_ACCEL_DISABLE = 'M',
IDX_OUTFILE_AUTOHEX_DISABLE = 0xff2f,
IDX_OUTFILE_CHECK_DIR = 0xff30,
IDX_OUTFILE_CHECK_TIMER = 0xff31,
IDX_OUTFILE_FORMAT = 0xff32,
IDX_OUTFILE = 'o',
IDX_POTFILE_DISABLE = 0xff32,
IDX_POTFILE_PATH = 0xff33,
IDX_PROGRESS_ONLY = 0xff34,
IDX_QUIET = 0xff35,
IDX_REMOVE = 0xff36,
IDX_REMOVE_TIMER = 0xff37,
IDX_RESTORE = 0xff38,
IDX_RESTORE_DISABLE = 0xff39,
IDX_RESTORE_FILE_PATH = 0xff3a,
IDX_POTFILE_DISABLE = 0xff33,
IDX_POTFILE_PATH = 0xff34,
IDX_PROGRESS_ONLY = 0xff35,
IDX_QUIET = 0xff36,
IDX_REMOVE = 0xff37,
IDX_REMOVE_TIMER = 0xff38,
IDX_RESTORE = 0xff39,
IDX_RESTORE_DISABLE = 0xff3a,
IDX_RESTORE_FILE_PATH = 0xff3b,
IDX_RP_FILE = 'r',
IDX_RP_GEN_FUNC_MAX = 0xff3b,
IDX_RP_GEN_FUNC_MIN = 0xff3c,
IDX_RP_GEN_FUNC_MAX = 0xff3c,
IDX_RP_GEN_FUNC_MIN = 0xff3d,
IDX_RP_GEN = 'g',
IDX_RP_GEN_SEED = 0xff3d,
IDX_RP_GEN_SEED = 0xff3e,
IDX_RULE_BUF_L = 'j',
IDX_RULE_BUF_R = 'k',
IDX_RUNTIME = 0xff3e,
IDX_SCRYPT_TMTO = 0xff3f,
IDX_RUNTIME = 0xff3f,
IDX_SCRYPT_TMTO = 0xff40,
IDX_SEGMENT_SIZE = 'c',
IDX_SELF_TEST_DISABLE = 0xff40,
IDX_SELF_TEST_DISABLE = 0xff41,
IDX_SEPARATOR = 'p',
IDX_SESSION = 0xff41,
IDX_SHOW = 0xff42,
IDX_SESSION = 0xff42,
IDX_SHOW = 0xff43,
IDX_SKIP = 's',
IDX_SLOW_CANDIDATES = 'S',
IDX_SPEED_ONLY = 0xff43,
IDX_SPIN_DAMP = 0xff44,
IDX_STATUS = 0xff45,
IDX_STATUS_JSON = 0xff46,
IDX_STATUS_TIMER = 0xff47,
IDX_STDOUT_FLAG = 0xff48,
IDX_STDIN_TIMEOUT_ABORT = 0xff49,
IDX_TRUECRYPT_KEYFILES = 0xff4a,
IDX_USERNAME = 0xff4b,
IDX_VERACRYPT_KEYFILES = 0xff4c,
IDX_VERACRYPT_PIM_START = 0xff4d,
IDX_VERACRYPT_PIM_STOP = 0xff4e,
IDX_SPEED_ONLY = 0xff44,
IDX_SPIN_DAMP = 0xff45,
IDX_STATUS = 0xff46,
IDX_STATUS_JSON = 0xff47,
IDX_STATUS_TIMER = 0xff48,
IDX_STDOUT_FLAG = 0xff49,
IDX_STDIN_TIMEOUT_ABORT = 0xff4a,
IDX_TRUECRYPT_KEYFILES = 0xff4b,
IDX_USERNAME = 0xff4c,
IDX_VERACRYPT_KEYFILES = 0xff4d,
IDX_VERACRYPT_PIM_START = 0xff4e,
IDX_VERACRYPT_PIM_STOP = 0xff4f,
IDX_VERSION_LOWER = 'v',
IDX_VERSION = 'V',
IDX_WORDLIST_AUTOHEX_DISABLE = 0xff4f,
IDX_WORDLIST_AUTOHEX_DISABLE = 0xff50,
IDX_WORKLOAD_PROFILE = 'w',
} user_options_map_t;
@ -925,7 +931,6 @@ typedef struct hashes
void *digests_buf;
u32 *digests_shown;
u32 *digests_shown_tmp;
u32 salts_cnt;
u32 salts_done;
@ -1074,7 +1079,7 @@ typedef struct hc_fp
bool is_zip;
int bom_size;
char *mode;
const char *mode;
const char *path;
} HCFILE;
@ -1116,6 +1121,8 @@ typedef struct hc_device_param
int sm_minor;
u32 kernel_exec_timeout;
u32 kernel_preferred_wgs_multiple;
st_status_t st_status;
int vector_width;
@ -1137,6 +1144,7 @@ typedef struct hc_device_param
u32 kernel_wgs_amp;
u32 kernel_wgs_tm;
u32 kernel_wgs_memset;
u32 kernel_wgs_bzero;
u32 kernel_wgs_atinit;
u32 kernel_wgs_utf8toutf16le;
u32 kernel_wgs_decompress;
@ -1162,6 +1170,7 @@ typedef struct hc_device_param
u32 kernel_preferred_wgs_multiple_amp;
u32 kernel_preferred_wgs_multiple_tm;
u32 kernel_preferred_wgs_multiple_memset;
u32 kernel_preferred_wgs_multiple_bzero;
u32 kernel_preferred_wgs_multiple_atinit;
u32 kernel_preferred_wgs_multiple_utf8toutf16le;
u32 kernel_preferred_wgs_multiple_decompress;
@ -1187,6 +1196,7 @@ typedef struct hc_device_param
u64 kernel_local_mem_size_amp;
u64 kernel_local_mem_size_tm;
u64 kernel_local_mem_size_memset;
u64 kernel_local_mem_size_bzero;
u64 kernel_local_mem_size_atinit;
u64 kernel_local_mem_size_utf8toutf16le;
u64 kernel_local_mem_size_decompress;
@ -1212,6 +1222,7 @@ typedef struct hc_device_param
u64 kernel_dynamic_local_mem_size_amp;
u64 kernel_dynamic_local_mem_size_tm;
u64 kernel_dynamic_local_mem_size_memset;
u64 kernel_dynamic_local_mem_size_bzero;
u64 kernel_dynamic_local_mem_size_atinit;
u64 kernel_dynamic_local_mem_size_utf8toutf16le;
u64 kernel_dynamic_local_mem_size_decompress;
@ -1373,6 +1384,7 @@ typedef struct hc_device_param
void *kernel_params_amp[PARAMCNT];
void *kernel_params_tm[PARAMCNT];
void *kernel_params_memset[PARAMCNT];
void *kernel_params_bzero[PARAMCNT];
void *kernel_params_atinit[PARAMCNT];
void *kernel_params_utf8toutf16le[PARAMCNT];
void *kernel_params_decompress[PARAMCNT];
@ -1395,6 +1407,9 @@ typedef struct hc_device_param
u32 kernel_params_memset_buf32[PARAMCNT];
u64 kernel_params_memset_buf64[PARAMCNT];
u32 kernel_params_bzero_buf32[PARAMCNT];
u64 kernel_params_bzero_buf64[PARAMCNT];
u32 kernel_params_atinit_buf32[PARAMCNT];
u64 kernel_params_atinit_buf64[PARAMCNT];
@ -1416,6 +1431,7 @@ typedef struct hc_device_param
CUevent cuda_event1;
CUevent cuda_event2;
CUevent cuda_event3;
CUmodule cuda_module;
CUmodule cuda_module_shared;
@ -1439,6 +1455,7 @@ typedef struct hc_device_param
CUfunction cuda_function_amp;
CUfunction cuda_function_tm;
CUfunction cuda_function_memset;
CUfunction cuda_function_bzero;
CUfunction cuda_function_atinit;
CUfunction cuda_function_utf8toutf16le;
CUfunction cuda_function_decompress;
@ -1490,79 +1507,81 @@ typedef struct hc_device_param
int hip_warp_size;
HIPdevice hip_device;
HIPcontext hip_context;
HIPstream hip_stream;
HIPevent hip_event1;
HIPevent hip_event2;
HIPmodule hip_module;
HIPmodule hip_module_shared;
HIPmodule hip_module_mp;
HIPmodule hip_module_amp;
HIPfunction hip_function1;
HIPfunction hip_function12;
HIPfunction hip_function2p;
HIPfunction hip_function2;
HIPfunction hip_function2e;
HIPfunction hip_function23;
HIPfunction hip_function3;
HIPfunction hip_function4;
HIPfunction hip_function_init2;
HIPfunction hip_function_loop2p;
HIPfunction hip_function_loop2;
HIPfunction hip_function_mp;
HIPfunction hip_function_mp_l;
HIPfunction hip_function_mp_r;
HIPfunction hip_function_amp;
HIPfunction hip_function_tm;
HIPfunction hip_function_memset;
HIPfunction hip_function_atinit;
HIPfunction hip_function_utf8toutf16le;
HIPfunction hip_function_decompress;
HIPfunction hip_function_aux1;
HIPfunction hip_function_aux2;
HIPfunction hip_function_aux3;
HIPfunction hip_function_aux4;
HIPdeviceptr hip_d_pws_buf;
HIPdeviceptr hip_d_pws_amp_buf;
HIPdeviceptr hip_d_pws_comp_buf;
HIPdeviceptr hip_d_pws_idx;
HIPdeviceptr hip_d_rules;
HIPdeviceptr hip_d_rules_c;
HIPdeviceptr hip_d_combs;
HIPdeviceptr hip_d_combs_c;
HIPdeviceptr hip_d_bfs;
HIPdeviceptr hip_d_bfs_c;
HIPdeviceptr hip_d_tm_c;
HIPdeviceptr hip_d_bitmap_s1_a;
HIPdeviceptr hip_d_bitmap_s1_b;
HIPdeviceptr hip_d_bitmap_s1_c;
HIPdeviceptr hip_d_bitmap_s1_d;
HIPdeviceptr hip_d_bitmap_s2_a;
HIPdeviceptr hip_d_bitmap_s2_b;
HIPdeviceptr hip_d_bitmap_s2_c;
HIPdeviceptr hip_d_bitmap_s2_d;
HIPdeviceptr hip_d_plain_bufs;
HIPdeviceptr hip_d_digests_buf;
HIPdeviceptr hip_d_digests_shown;
HIPdeviceptr hip_d_salt_bufs;
HIPdeviceptr hip_d_esalt_bufs;
HIPdeviceptr hip_d_tmps;
HIPdeviceptr hip_d_hooks;
HIPdeviceptr hip_d_result;
HIPdeviceptr hip_d_extra0_buf;
HIPdeviceptr hip_d_extra1_buf;
HIPdeviceptr hip_d_extra2_buf;
HIPdeviceptr hip_d_extra3_buf;
HIPdeviceptr hip_d_root_css_buf;
HIPdeviceptr hip_d_markov_css_buf;
HIPdeviceptr hip_d_st_digests_buf;
HIPdeviceptr hip_d_st_salts_buf;
HIPdeviceptr hip_d_st_esalts_buf;
hipDevice_t hip_device;
hipCtx_t hip_context;
hipStream_t hip_stream;
hipEvent_t hip_event1;
hipEvent_t hip_event2;
hipEvent_t hip_event3;
hipModule_t hip_module;
hipModule_t hip_module_shared;
hipModule_t hip_module_mp;
hipModule_t hip_module_amp;
hipFunction_t hip_function1;
hipFunction_t hip_function12;
hipFunction_t hip_function2p;
hipFunction_t hip_function2;
hipFunction_t hip_function2e;
hipFunction_t hip_function23;
hipFunction_t hip_function3;
hipFunction_t hip_function4;
hipFunction_t hip_function_init2;
hipFunction_t hip_function_loop2p;
hipFunction_t hip_function_loop2;
hipFunction_t hip_function_mp;
hipFunction_t hip_function_mp_l;
hipFunction_t hip_function_mp_r;
hipFunction_t hip_function_amp;
hipFunction_t hip_function_tm;
hipFunction_t hip_function_memset;
hipFunction_t hip_function_bzero;
hipFunction_t hip_function_atinit;
hipFunction_t hip_function_utf8toutf16le;
hipFunction_t hip_function_decompress;
hipFunction_t hip_function_aux1;
hipFunction_t hip_function_aux2;
hipFunction_t hip_function_aux3;
hipFunction_t hip_function_aux4;
hipDeviceptr_t hip_d_pws_buf;
hipDeviceptr_t hip_d_pws_amp_buf;
hipDeviceptr_t hip_d_pws_comp_buf;
hipDeviceptr_t hip_d_pws_idx;
hipDeviceptr_t hip_d_rules;
hipDeviceptr_t hip_d_rules_c;
hipDeviceptr_t hip_d_combs;
hipDeviceptr_t hip_d_combs_c;
hipDeviceptr_t hip_d_bfs;
hipDeviceptr_t hip_d_bfs_c;
hipDeviceptr_t hip_d_tm_c;
hipDeviceptr_t hip_d_bitmap_s1_a;
hipDeviceptr_t hip_d_bitmap_s1_b;
hipDeviceptr_t hip_d_bitmap_s1_c;
hipDeviceptr_t hip_d_bitmap_s1_d;
hipDeviceptr_t hip_d_bitmap_s2_a;
hipDeviceptr_t hip_d_bitmap_s2_b;
hipDeviceptr_t hip_d_bitmap_s2_c;
hipDeviceptr_t hip_d_bitmap_s2_d;
hipDeviceptr_t hip_d_plain_bufs;
hipDeviceptr_t hip_d_digests_buf;
hipDeviceptr_t hip_d_digests_shown;
hipDeviceptr_t hip_d_salt_bufs;
hipDeviceptr_t hip_d_esalt_bufs;
hipDeviceptr_t hip_d_tmps;
hipDeviceptr_t hip_d_hooks;
hipDeviceptr_t hip_d_result;
hipDeviceptr_t hip_d_extra0_buf;
hipDeviceptr_t hip_d_extra1_buf;
hipDeviceptr_t hip_d_extra2_buf;
hipDeviceptr_t hip_d_extra3_buf;
hipDeviceptr_t hip_d_root_css_buf;
hipDeviceptr_t hip_d_markov_css_buf;
hipDeviceptr_t hip_d_st_digests_buf;
hipDeviceptr_t hip_d_st_salts_buf;
hipDeviceptr_t hip_d_st_esalts_buf;
// API: opencl
@ -1604,6 +1623,7 @@ typedef struct hc_device_param
cl_kernel opencl_kernel_amp;
cl_kernel opencl_kernel_tm;
cl_kernel opencl_kernel_memset;
cl_kernel opencl_kernel_bzero;
cl_kernel opencl_kernel_atinit;
cl_kernel opencl_kernel_utf8toutf16le;
cl_kernel opencl_kernel_decompress;
@ -1712,8 +1732,8 @@ typedef struct backend_ctx
int rc_hip_init;
int rc_hiprtc_init;
int hiprtc_driver_version;
int hip_driver_version;
int hip_runtimeVersion;
int hip_driverVersion;
// opencl
@ -1785,8 +1805,6 @@ typedef struct hwmon_ctx
hm_attrs_t *hm_device;
ADLOD6MemClockState *od_clock_mem_status;
} hwmon_ctx_t;
#if defined (__APPLE__)
@ -2031,9 +2049,11 @@ typedef struct tuning_db
tuning_db_alias_t *alias_buf;
int alias_cnt;
int alias_alloc;
tuning_db_entry_t *entry_buf;
int entry_cnt;
int entry_alloc;
} tuning_db_t;
@ -2090,6 +2110,7 @@ typedef struct user_options
bool skip_chgd;
bool limit_chgd;
bool scrypt_tmto_chgd;
bool separator_chgd;
bool advice_disable;
bool benchmark;
@ -2099,6 +2120,7 @@ typedef struct user_options
bool brain_server;
#endif
bool force;
bool deprecated_check_disable;
bool hwmon_disable;
bool hash_info;
bool hex_charset;
@ -2119,6 +2141,7 @@ typedef struct user_options
bool backend_ignore_opencl;
bool backend_info;
bool optimized_kernel_enable;
bool multiply_accel_disable;
bool outfile_autohex;
bool potfile_disable;
bool progress_only;
@ -2158,7 +2181,7 @@ typedef struct user_options
char *potfile_path;
char *restore_file_path;
char **rp_files;
char separator;
char *separator;
char *truecrypt_keyfiles;
char *veracrypt_keyfiles;
const char *custom_charset_1;
@ -2224,6 +2247,8 @@ typedef struct user_options_extra
u32 wordlist_mode;
char separator;
char *hc_hash; // can be filename or string
int hc_workc; // can be 0 in bf-mode = default mask
@ -2626,13 +2651,15 @@ typedef struct module_ctx
void *(*module_benchmark_hook_salt) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
const char *(*module_benchmark_mask) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
salt_t *(*module_benchmark_salt) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
bool (*module_dictstat_disable) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
const char *(*module_deprecated_notice) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_dgst_pos0) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_dgst_pos1) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_dgst_pos2) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_dgst_pos3) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_dgst_size) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
bool (*module_dictstat_disable) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u64 (*module_esalt_size) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
const char *(*module_extra_tuningdb_block) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_forced_outfile_format) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
u32 (*module_hash_category) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);
const char *(*module_hash_name) (const hashconfig_t *, const user_options_t *, const user_options_extra_t *);

@ -53,7 +53,7 @@ endif
## Do not modify
##
MODULE_INTERFACE_VERSION := 620
MODULE_INTERFACE_VERSION := 630
##
## Native compiler paths
@ -203,12 +203,11 @@ endif
## because UNRAR
ifeq ($(ENABLE_UNRAR),1)
ifeq ($(USE_SYSTEM_UNRAR),0)
ifneq ($(UNAME),Darwin)
CFLAGS_UNRAR += -Wno-misleading-indentation
ifneq ($(CC),clang)
CFLAGS_UNRAR += -Wno-class-memaccess
else
CFLAGS_UNRAR += -Wno-missing-braces
CFLAGS_UNRAR += -Wno-misleading-indentation
endif
CFLAGS_UNRAR += -Wno-missing-braces
CFLAGS_UNRAR += -Wno-unused-variable
CFLAGS_UNRAR += -Wno-unused-parameter
CFLAGS_UNRAR += -Wno-unused-function

@ -10,7 +10,7 @@
#include "status.h"
#include "autotune.h"
static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads)
{
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
user_options_t *user_options = hashcat_ctx->user_options;
@ -19,7 +19,9 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set
device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
u32 kernel_power_try = device_param->hardware_power * kernel_accel;
const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads;
u32 kernel_power_try = hardware_power * kernel_accel;
if (user_options->attack_mode == ATTACK_MODE_ASSOCIATION)
{
@ -33,6 +35,10 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
}
}
const u32 kernel_threads_sav = device_param->kernel_threads;
device_param->kernel_threads = kernel_threads;
const double spin_damp_sav = device_param->spin_damp;
device_param->spin_damp = 0;
@ -55,59 +61,46 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
device_param->spin_damp = spin_damp_sav;
device_param->kernel_threads = kernel_threads_sav;
const double exec_msec_prev = get_avg_exec_time (device_param, 1);
return exec_msec_prev;
}
/*
static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads, const int times)
{
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
device_param->kernel_params_buf32[28] = 0;
device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set
device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
const u32 kernel_power_try = device_param->hardware_power * kernel_accel;
const u32 kernel_threads_sav = device_param->kernel_threads;
const double spin_damp_sav = device_param->spin_damp;
double exec_msec_best = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
device_param->spin_damp = 0;
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
for (int i = 1; i < times; i++)
{
if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
{
device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple1;
double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_try, true, 0);
}
else
{
device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple4;
if (exec_msec > exec_msec_best) continue;
run_kernel (hashcat_ctx, device_param, KERN_RUN_4, 0, kernel_power_try, true, 0);
}
exec_msec_best = exec_msec;
}
else
{
device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple2;
run_kernel (hashcat_ctx, device_param, KERN_RUN_2, 0, kernel_power_try, true, 0);
}
return exec_msec_best;
}
device_param->kernel_threads = kernel_threads_sav;
static u32 previous_power_of_two (const u32 x)
{
// https://stackoverflow.com/questions/2679815/previous-power-of-2
// really cool!
device_param->spin_damp = spin_damp_sav;
if (x == 0) return 0;
const double exec_msec_prev = get_avg_exec_time (device_param, 1);
u32 r = x;
return exec_msec_prev;
r |= (r >> 1);
r |= (r >> 2);
r |= (r >> 4);
r |= (r >> 8);
r |= (r >> 16);
return r - (r >> 1);
}
*/
static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
{
@ -124,9 +117,57 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
const u32 kernel_loops_min = device_param->kernel_loops_min;
const u32 kernel_loops_max = device_param->kernel_loops_max;
const u32 kernel_threads_min = device_param->kernel_threads_min;
const u32 kernel_threads_max = device_param->kernel_threads_max;
u32 kernel_accel = kernel_accel_min;
u32 kernel_loops = kernel_loops_min;
// for the threads we take as initial value what we receive from the runtime
// but is only to start with something, we will fine tune this value as soon as we have our workload specified
// this thread limiting is also performed insinde run_kernel() so we need to redo it here, too
u32 kernel_wgs = 0;
u32 kernel_wgs_multiple = 0;
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
{
if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
{
kernel_wgs = device_param->kernel_wgs1;
kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple1;
}
else
{
kernel_wgs = device_param->kernel_wgs4;
kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple4;
}
}
else
{
kernel_wgs = device_param->kernel_wgs2;
kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple2;
}
u32 kernel_threads = kernel_threads_max;
if ((kernel_wgs >= kernel_threads_min) && (kernel_wgs <= kernel_threads_max))
{
kernel_threads = kernel_wgs;
}
// having a value power of 2 makes it easier to divide
const u32 kernel_threads_p2 = previous_power_of_two (kernel_threads);
if ((kernel_threads_p2 >= kernel_threads_min) && (kernel_threads_p2 <= kernel_threads_max))
{
kernel_threads = kernel_threads_p2;
}
// in this case the user specified a fixed -n and -u on the commandline
// no way to tune anything
// but we need to run a few caching rounds
@ -142,10 +183,10 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
if (hashconfig->warmup_disable == false)
{
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads);
}
#endif
@ -157,29 +198,19 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
const u32 kernel_power_max = device_param->hardware_power * kernel_accel_max;
int CU_rc;
int HIP_rc;
int CL_rc;
if (device_param->is_cuda == true)
{
CU_rc = run_cuda_kernel_atinit (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, kernel_power_max);
if (CU_rc == -1) return -1;
if (run_cuda_kernel_atinit (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, kernel_power_max) == -1) return -1;
}
if (device_param->is_hip == true)
{
HIP_rc = run_hip_kernel_atinit (hashcat_ctx, device_param, device_param->hip_d_pws_buf, kernel_power_max);
if (HIP_rc == -1) return -1;
if (run_hip_kernel_atinit (hashcat_ctx, device_param, device_param->hip_d_pws_buf, kernel_power_max) == -1) return -1;
}
if (device_param->is_opencl == true)
{
CL_rc = run_opencl_kernel_atinit (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, kernel_power_max);
if (CL_rc == -1) return -1;
if (run_opencl_kernel_atinit (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, kernel_power_max) == -1) return -1;
}
if (user_options->slow_candidates == true)
@ -193,35 +224,53 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
{
if (device_param->is_cuda == true)
{
CU_rc = hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t));
if (CU_rc == -1) return -1;
if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->cuda_stream) == -1) return -1;
}
if (device_param->is_hip == true)
{
HIP_rc = hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t));
if (HIP_rc == -1) return -1;
if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->hip_stream) == -1) return -1;
}
if (device_param->is_opencl == true)
{
CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, device_param->opencl_d_rules_c, 0, 0, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), 0, NULL, NULL);
if (CL_rc == -1) return -1;
if (hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, device_param->opencl_d_rules_c, 0, 0, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), 0, NULL, NULL) == -1) return -1;
}
}
}
}
// we also need to initialize some values using kernels
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
{
// nothing to do
}
else
{
const u32 kernel_threads_sav = device_param->kernel_threads;
device_param->kernel_threads = device_param->kernel_wgs1;
run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_max, false, 0);
if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
{
device_param->kernel_threads = device_param->kernel_wgs2p;
run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_max, false, 0);
}
device_param->kernel_threads = kernel_threads_sav;
}
// Do a pre-autotune test run to find out if kernel runtime is above some TDR limit
u32 kernel_loops_max_reduced = kernel_loops_max;
if (true)
{
double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min);
double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads);
if (exec_msec > 2000)
{
@ -230,7 +279,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
return -1;
}
exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min);
exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads);
const u32 mm = kernel_loops_max / kernel_loops_min;
@ -250,16 +299,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
{
if (kernel_loops > kernel_loops_max_reduced) continue;
double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops);
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops, kernel_threads, 1);
if (exec_msec < target_msec) break;
}
}
// now the same for kernel-accel but with the new kernel-loops from previous loop set
#define STEPS_CNT 16
// now the same for kernel-accel but with the new kernel-loops from previous loop set
if (kernel_accel_min < kernel_accel_max)
{
for (int i = 0; i < STEPS_CNT; i++)
@ -269,7 +318,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
if (kernel_accel_try < kernel_accel_min) continue;
if (kernel_accel_try > kernel_accel_max) break;
double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops);
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads, 1);
if (exec_msec > target_msec) break;
@ -285,7 +334,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
const u32 kernel_accel_orig = kernel_accel;
const u32 kernel_loops_orig = kernel_loops;
double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1);
for (int i = 1; i < STEPS_CNT; i++)
{
@ -300,7 +349,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
// do a real test
const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try);
const double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try, kernel_threads, 1);
if (exec_msec_prev < exec_msec) break;
@ -317,7 +366,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
}
}
double exec_msec_pre_final = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
double exec_msec_pre_final = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1);
const u32 exec_left = (const u32) (target_msec / exec_msec_pre_final);
@ -332,46 +381,43 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
kernel_accel *= exec_accel_min;
}
// start finding best thread count is easier.
// it's either the preferred or the maximum thread count
// v6.2.4 new section: find thread count
// This is not as effective as it could be because of inaccurate kernel return timers
// But is better than fixed values
// Timers in this section are critical, so we rerun meassurements 3 times
/*
const u32 kernel_threads_min = device_param->kernel_threads_min;
const u32 kernel_threads_max = device_param->kernel_threads_max;
if (kernel_threads_min < kernel_threads_max)
if (kernel_threads_max > kernel_threads_min)
{
const double exec_msec_max = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops);
const u32 kernel_accel_orig = kernel_accel;
const u32 kernel_threads_orig = kernel_threads;
u32 preferred_threads = 0;
double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 3);
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
{
if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
{
preferred_threads = device_param->kernel_preferred_wgs_multiple1;
}
else
{
preferred_threads = device_param->kernel_preferred_wgs_multiple4;
}
}
else
for (int i = 1; i < STEPS_CNT; i++)
{
preferred_threads = device_param->kernel_preferred_wgs_multiple2;
}
const u32 kernel_accel_try = kernel_accel_orig * (1U << i);
const u32 kernel_threads_try = kernel_threads_orig / (1U << i);
if ((preferred_threads >= kernel_threads_min) && (preferred_threads <= kernel_threads_max))
{
const double exec_msec_preferred = try_run_preferred (hashcat_ctx, device_param, kernel_accel, kernel_loops);
// since we do not modify total amount of workitems, we can (and need) to do increase kernel_accel_max
if (exec_msec_preferred < exec_msec_max)
{
device_param->kernel_threads = preferred_threads;
}
const u32 kernel_accel_max_try = kernel_accel_max * (1U << i);
if (kernel_accel_try > kernel_accel_max_try) break;
if (kernel_threads_try < kernel_threads_min) break;
if (kernel_threads_try % kernel_wgs_multiple) break; // this would just be waste of time
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads_try, 3);
if (exec_msec > exec_msec_prev) continue;
exec_msec_prev = exec_msec;
kernel_accel = kernel_accel_try;
kernel_threads = kernel_threads_try;
}
}
*/
}
// reset them fake words
@ -379,65 +425,43 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
if (device_param->is_cuda == true)
{
int CU_rc;
CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, 0, device_param->size_pws);
if (CU_rc == -1) return -1;
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_pws_buf, device_param->size_pws) == -1) return -1;
CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_plain_bufs, 0, device_param->size_plains);
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_plain_bufs, device_param->size_plains) == -1) return -1;
if (CU_rc == -1) return -1;
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_digests_shown, device_param->size_shown) == -1) return -1;
CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_digests_shown, 0, device_param->size_shown);
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_result, device_param->size_results) == -1) return -1;
if (CU_rc == -1) return -1;
CU_rc = run_cuda_kernel_memset (hashcat_ctx, device_param, device_param->cuda_d_result, 0, device_param->size_results);
if (CU_rc == -1) return -1;
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_tmps, device_param->size_tmps) == -1) return -1;
}
if (device_param->is_hip == true)
{
int HIP_rc;
HIP_rc = run_hip_kernel_memset (hashcat_ctx, device_param, device_param->hip_d_pws_buf, 0, device_param->size_pws);
if (HIP_rc == -1) return -1;
if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_pws_buf, device_param->size_pws) == -1) return -1;
HIP_rc = run_hip_kernel_memset (hashcat_ctx, device_param, device_param->hip_d_plain_bufs, 0, device_param->size_plains);
if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_plain_bufs, device_param->size_plains) == -1) return -1;
if (HIP_rc == -1) return -1;
if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_digests_shown, device_param->size_shown) == -1) return -1;
HIP_rc = run_hip_kernel_memset (hashcat_ctx, device_param, device_param->hip_d_digests_shown, 0, device_param->size_shown);
if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_result, device_param->size_results) == -1) return -1;
if (HIP_rc == -1) return -1;
HIP_rc = run_hip_kernel_memset (hashcat_ctx, device_param, device_param->hip_d_result, 0, device_param->size_results);
if (HIP_rc == -1) return -1;
if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_tmps, device_param->size_tmps) == -1) return -1;
}
if (device_param->is_opencl == true)
{
int CL_rc;
CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, 0, device_param->size_pws);
if (CL_rc == -1) return -1;
if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_pws_buf, device_param->size_pws) == -1) return -1;
CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_plain_bufs, 0, device_param->size_plains);
if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_plain_bufs, device_param->size_plains) == -1) return -1;
if (CL_rc == -1) return -1;
if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_digests_shown, device_param->size_shown) == -1) return -1;
CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_digests_shown, 0, device_param->size_shown);
if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_result, device_param->size_results) == -1) return -1;
if (CL_rc == -1) return -1;
if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_tmps, device_param->size_tmps) == -1) return -1;
CL_rc = run_opencl_kernel_memset (hashcat_ctx, device_param, device_param->opencl_d_result, 0, device_param->size_results);
if (CL_rc == -1) return -1;
if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
}
// reset timer
@ -459,8 +483,13 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
// store
device_param->kernel_accel = kernel_accel;
device_param->kernel_loops = kernel_loops;
device_param->kernel_accel = kernel_accel;
device_param->kernel_loops = kernel_loops;
device_param->kernel_threads = kernel_threads;
const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads;
device_param->hardware_power = hardware_power;
const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel;

File diff suppressed because it is too large Load Diff

@ -172,6 +172,14 @@ u32 brain_compute_attack (hashcat_ctx_t *hashcat_ctx)
XXH64_update (state, &hex_salt, sizeof (hex_salt));
const u32 opti_type = hashconfig->opti_type;
XXH64_update (state, &opti_type, sizeof (opti_type));
const u64 opts_type = hashconfig->opts_type;
XXH64_update (state, &opts_type, sizeof (opts_type));
const int hccapx_message_pair = user_options->hccapx_message_pair;
XXH64_update (state, &hccapx_message_pair, sizeof (hccapx_message_pair));
@ -658,7 +666,7 @@ u32 brain_auth_challenge (void)
#else
static const char *urandom = "/dev/urandom";
static const char *const urandom = "/dev/urandom";
HCFILE fp;

@ -50,27 +50,26 @@ int adl_init (void *hashcat_ctx)
return -1;
}
HC_LOAD_FUNC(adl, ADL_Main_Control_Destroy, ADL_MAIN_CONTROL_DESTROY, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Main_Control_Create, ADL_MAIN_CONTROL_CREATE, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Adapter_NumberOfAdapters_Get, ADL_ADAPTER_NUMBEROFADAPTERS_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Adapter_Active_Get, ADL_ADAPTER_ACTIVE_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Adapter_AdapterInfo_Get, ADL_ADAPTER_ADAPTERINFO_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Display_DisplayInfo_Get, ADL_DISPLAY_DISPLAYINFO_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Adapter_ID_Get, ADL_ADAPTER_ID_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Adapter_VideoBiosInfo_Get, ADL_ADAPTER_VIDEOBIOSINFO_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_ThermalDevices_Enum, ADL_OVERDRIVE5_THERMALDEVICES_ENUM, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_Temperature_Get, ADL_OVERDRIVE5_TEMPERATURE_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_Temperature_Get, ADL_OVERDRIVE6_TEMPERATURE_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Adapter_NumberOfAdapters_Get, ADL_ADAPTER_NUMBEROFADAPTERS_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Main_Control_Create, ADL_MAIN_CONTROL_CREATE, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Main_Control_Destroy, ADL_MAIN_CONTROL_DESTROY, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_CurrentActivity_Get, ADL_OVERDRIVE5_CURRENTACTIVITY_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_FanSpeedInfo_Get, ADL_OVERDRIVE5_FANSPEEDINFO_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_FanSpeed_Get, ADL_OVERDRIVE5_FANSPEED_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_FanSpeed_Get, ADL_OVERDRIVE6_FANSPEED_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Adapter_Active_Get, ADL_ADAPTER_ACTIVE_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive_Caps, ADL_OVERDRIVE_CAPS, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_ODParameters_Get, ADL_OVERDRIVE5_ODPARAMETERS_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_ODPerformanceLevels_Get, ADL_OVERDRIVE5_ODPERFORMANCELEVELS_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_Temperature_Get, ADL_OVERDRIVE5_TEMPERATURE_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive5_ThermalDevices_Enum, ADL_OVERDRIVE5_THERMALDEVICES_ENUM, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_Capabilities_Get, ADL_OVERDRIVE6_CAPABILITIES_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_StateInfo_Get, ADL_OVERDRIVE6_STATEINFO_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_CurrentStatus_Get, ADL_OVERDRIVE6_CURRENTSTATUS_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_TargetTemperatureData_Get, ADL_OVERDRIVE6_TARGETTEMPERATUREDATA_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_TargetTemperatureRangeInfo_Get, ADL_OVERDRIVE6_TARGETTEMPERATURERANGEINFO_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_FanSpeed_Get, ADL_OVERDRIVE6_FANSPEED_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_StateInfo_Get, ADL_OVERDRIVE6_STATEINFO_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive6_Temperature_Get, ADL_OVERDRIVE6_TEMPERATURE_GET, ADL, 0);
HC_LOAD_FUNC(adl, ADL_Overdrive_Caps, ADL_OVERDRIVE_CAPS, ADL, 0);
HC_LOAD_FUNC(adl, ADL2_Overdrive_Caps, ADL2_OVERDRIVE_CAPS, ADL, 1);
HC_LOAD_FUNC(adl, ADL2_New_QueryPMLogData_Get, ADL2_NEW_QUERYPMLOGDATA_GET, ADL, 1);
return 0;
}
@ -270,17 +269,42 @@ int hm_ADL_Overdrive_Caps (void *hashcat_ctx, int iAdapterIndex, int *od_support
return 0;
}
int hm_ADL_Overdrive6_TargetTemperatureData_Get (void *hashcat_ctx, int iAdapterIndex, int *cur_temp, int *default_temp)
int hm_ADL2_Overdrive_Caps (void *hashcat_ctx, int iAdapterIndex, int *iSupported, int *iEnabled, int *iVersion)
{
hwmon_ctx_t *hwmon_ctx = ((hashcat_ctx_t *) hashcat_ctx)->hwmon_ctx;
ADL_PTR *adl = (ADL_PTR *) hwmon_ctx->hm_adl;
// Not sure if that makes any sense...
if (adl->ADL2_Overdrive_Caps == NULL)
{
return hm_ADL_Overdrive_Caps (hashcat_ctx, iAdapterIndex, iSupported, iEnabled, iVersion);
}
const int ADL_rc = adl->ADL2_Overdrive_Caps (NULL, iAdapterIndex, iSupported, iEnabled, iVersion);
if (ADL_rc != ADL_OK)
{
event_log_error (hashcat_ctx, "ADL2_Overdrive_Caps(): %d", ADL_rc);
return -1;
}
return 0;
}
int hm_ADL2_New_QueryPMLogData_Get (void *hashcat_ctx, int iAdapterIndex, ADLPMLogDataOutput *lpDataOutput)
{
hwmon_ctx_t *hwmon_ctx = ((hashcat_ctx_t *) hashcat_ctx)->hwmon_ctx;
ADL_PTR *adl = (ADL_PTR *) hwmon_ctx->hm_adl;
const int ADL_rc = adl->ADL_Overdrive6_TargetTemperatureData_Get (iAdapterIndex, cur_temp, default_temp);
const int ADL_rc = adl->ADL2_New_QueryPMLogData_Get (NULL, iAdapterIndex, lpDataOutput);
if (ADL_rc != ADL_OK)
{
event_log_error (hashcat_ctx, "ADL_Overdrive6_TargetTemperatureData_Get(): %d", ADL_rc);
event_log_error (hashcat_ctx, "ADL2_New_QueryPMLogData_Get(): %d", ADL_rc);
return -1;
}

@ -21,7 +21,7 @@ int _wopen (const char *path, int oflag, ...)
}
#endif
bool hc_fopen (HCFILE *fp, const char *path, char *mode)
bool hc_fopen (HCFILE *fp, const char *path, const char *mode)
{
if (path == NULL || mode == NULL) return false;
@ -130,7 +130,7 @@ bool hc_fopen (HCFILE *fp, const char *path, char *mode)
return true;
}
bool hc_fopen_raw (HCFILE *fp, const char *path, char *mode)
bool hc_fopen_raw (HCFILE *fp, const char *path, const char *mode)
{
if (path == NULL || mode == NULL) return false;

@ -430,15 +430,17 @@ static int inner1_loop (hashcat_ctx_t *hashcat_ctx)
static int outer_loop (hashcat_ctx_t *hashcat_ctx)
{
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
hashes_t *hashes = hashcat_ctx->hashes;
mask_ctx_t *mask_ctx = hashcat_ctx->mask_ctx;
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
outcheck_ctx_t *outcheck_ctx = hashcat_ctx->outcheck_ctx;
restore_ctx_t *restore_ctx = hashcat_ctx->restore_ctx;
status_ctx_t *status_ctx = hashcat_ctx->status_ctx;
straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx;
user_options_t *user_options = hashcat_ctx->user_options;
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
hashes_t *hashes = hashcat_ctx->hashes;
mask_ctx_t *mask_ctx = hashcat_ctx->mask_ctx;
module_ctx_t *module_ctx = hashcat_ctx->module_ctx;
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
outcheck_ctx_t *outcheck_ctx = hashcat_ctx->outcheck_ctx;
restore_ctx_t *restore_ctx = hashcat_ctx->restore_ctx;
status_ctx_t *status_ctx = hashcat_ctx->status_ctx;
straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx;
user_options_t *user_options = hashcat_ctx->user_options;
user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
status_ctx->devices_status = STATUS_INIT;
@ -463,6 +465,46 @@ static int outer_loop (hashcat_ctx_t *hashcat_ctx)
EVENT (EVENT_HASHCONFIG_POST);
/**
* deprecated notice
*/
if (module_ctx->module_deprecated_notice != MODULE_DEFAULT)
{
if (user_options->deprecated_check_disable == false)
{
if ((user_options->show == true) || (user_options->left == true))
{
const char *module_deprecated_notice = module_ctx->module_deprecated_notice (hashconfig, user_options, user_options_extra);
event_log_warning (hashcat_ctx, "%s", module_deprecated_notice);
event_log_warning (hashcat_ctx, NULL);
}
else if (user_options->benchmark == true)
{
if (user_options->hash_mode_chgd == true)
{
const char *module_deprecated_notice = module_ctx->module_deprecated_notice (hashconfig, user_options, user_options_extra);
event_log_warning (hashcat_ctx, "%s", module_deprecated_notice);
event_log_warning (hashcat_ctx, NULL);
}
else
{
return 0;
}
}
else
{
const char *module_deprecated_notice = module_ctx->module_deprecated_notice (hashconfig, user_options, user_options_extra);
event_log_error (hashcat_ctx, "%s", module_deprecated_notice);
return 0;
}
}
}
/**
* generate hashlist filename for later use
*/

@ -300,7 +300,7 @@ int save_hash (hashcat_ctx_t *hashcat_ctx)
return 0;
}
void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, plain_t *plain)
int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, plain_t *plain)
{
const debugfile_ctx_t *debugfile_ctx = hashcat_ctx->debugfile_ctx;
const hashes_t *hashes = hashcat_ctx->hashes;
@ -313,23 +313,63 @@ void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pl
void *tmps = NULL;
cl_event opencl_event;
int rc = -1;
if (hashconfig->opts_type & OPTS_TYPE_COPY_TMPS)
{
tmps = hcmalloc (hashconfig->tmp_size);
if (device_param->is_cuda == true)
{
hc_cuMemcpyDtoH (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size);
rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->cuda_stream);
if (rc == 0)
{
rc = hc_cuEventRecord (hashcat_ctx, device_param->cuda_event3, device_param->cuda_stream);
}
if (rc == -1)
{
hcfree (tmps);
return -1;
}
}
if (device_param->is_hip == true)
{
hc_hipMemcpyDtoH (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size);
rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->hip_stream);
if (rc == 0)
{
rc = hc_hipEventRecord (hashcat_ctx, device_param->hip_event3, device_param->hip_stream);
}
if (rc == -1)
{
hcfree (tmps);
return -1;
}
}
if (device_param->is_opencl == true)
{
hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_TRUE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, NULL);
rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_FALSE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, &opencl_event);
if (rc == 0)
{
rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue);
}
if (rc == -1)
{
hcfree (tmps);
return -1;
}
}
}
@ -337,15 +377,14 @@ void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pl
u8 *out_buf = hashes->out_buf;
int out_len = hash_encode (hashcat_ctx->hashconfig, hashcat_ctx->hashes, hashcat_ctx->module_ctx, (char *) out_buf, HCBUFSIZ_LARGE, salt_pos, digest_pos);
int out_len = hash_encode (hashconfig, hashes, module_ctx, (char *) out_buf, HCBUFSIZ_LARGE, salt_pos, digest_pos);
out_buf[out_len] = 0;
// plain
u8 plain_buf[0x1000]; // while the password itself can have only length 256, the module could encode it with something like base64 which inflates the requires buffer size
memset (plain_buf, 0, sizeof (plain_buf));
u8 plain_buf[HCBUFSIZ_TINY] = { 0 }; // while the password itself can have only length 256, the module could encode it with something like base64 which inflates the requires buffer size
u8 postprocess_buf[HCBUFSIZ_TINY] = { 0 };
u8 *plain_ptr = plain_buf;
@ -355,18 +394,27 @@ void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pl
if (module_ctx->module_build_plain_postprocess != MODULE_DEFAULT)
{
u8 temp_buf[0x1000];
if (hashconfig->opts_type & OPTS_TYPE_COPY_TMPS)
{
if (device_param->is_cuda == true)
{
if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event3) == -1) return -1;
}
memset (temp_buf, 0, sizeof (temp_buf));
if (device_param->is_hip == true)
{
if (hc_hipEventSynchronize (hashcat_ctx, device_param->hip_event3) == -1) return -1;
}
const int temp_len = module_ctx->module_build_plain_postprocess (hashcat_ctx->hashconfig, hashcat_ctx->hashes, tmps, (u32 *) plain_buf, sizeof (plain_buf), plain_len, (u32 *)temp_buf, sizeof (temp_buf));
if (device_param->is_opencl == true)
{
if (hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event) == -1) return -1;
}
}
if (temp_len < (int) sizeof (plain_buf))
{
memcpy (plain_buf, temp_buf, temp_len);
plain_len = module_ctx->module_build_plain_postprocess (hashconfig, hashes, tmps, (u32 *) plain_buf, sizeof (plain_buf), plain_len, (u32 *) postprocess_buf, sizeof (postprocess_buf));
plain_len = temp_len;
}
plain_ptr = postprocess_buf;
}
// crackpos
@ -407,6 +455,24 @@ void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pl
if (module_ctx->module_hash_encode_potfile != MODULE_DEFAULT)
{
if (hashconfig->opts_type & OPTS_TYPE_COPY_TMPS)
{
if (device_param->is_cuda == true)
{
if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event3) == -1) return -1;
}
if (device_param->is_hip == true)
{
if (hc_hipEventSynchronize (hashcat_ctx, device_param->hip_event3) == -1) return -1;
}
if (device_param->is_opencl == true)
{
if (hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event) == -1) return -1;
}
}
salt_t *salts_buf = hashes->salts_buf;
salts_buf += salt_pos;
@ -471,7 +537,14 @@ void check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pl
if (hashconfig->opts_type & OPTS_TYPE_COPY_TMPS)
{
hcfree (tmps);
if (device_param->is_opencl == true)
{
if (hc_clReleaseEvent (hashcat_ctx, opencl_event) == -1) return -1;
}
}
return 0;
}
//int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 salt_pos)
@ -485,172 +558,204 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
u32 num_cracked = 0;
int CU_rc;
int HIP_rc;
int CL_rc;
int rc = -1;
if (device_param->is_cuda == true)
{
CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32));
if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1;
if (CU_rc == -1) return -1;
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
}
if (device_param->is_hip == true)
{
HIP_rc = hc_hipMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32));
if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1;
if (HIP_rc == -1) return -1;
if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
}
if (device_param->is_opencl == true)
{
CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
if (CL_rc == -1) return -1;
/* blocking */
if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL) == -1) return -1;
}
if (user_options->speed_only == true)
if (num_cracked == 0 || user_options->speed_only == true)
{
// we want the hc_clEnqueueReadBuffer to run in benchmark mode because it has an influence in performance
// we want to get the num_cracked in benchmark mode because it has an influence in performance
// however if the benchmark cracks the artificial hash used for benchmarks we don't want to see that!
return 0;
}
if (num_cracked)
plain_t *cracked = (plain_t *) hcmalloc (num_cracked * sizeof (plain_t));
if (device_param->is_cuda == true)
{
plain_t *cracked = (plain_t *) hccalloc (num_cracked, sizeof (plain_t));
rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->cuda_stream);
if (device_param->is_cuda == true)
if (rc == 0)
{
rc = hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream);
}
if (rc == -1)
{
CU_rc = hc_cuMemcpyDtoH (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t));
hcfree (cracked);
if (CU_rc == -1) return -1;
return -1;
}
}
if (device_param->is_hip == true)
if (device_param->is_hip == true)
{
rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->hip_stream);
if (rc == 0)
{
HIP_rc = hc_hipMemcpyDtoH (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t));
rc = hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream);
}
if (HIP_rc == -1) return -1;
if (rc == -1)
{
hcfree (cracked);
return -1;
}
}
if (device_param->is_opencl == true)
if (device_param->is_opencl == true)
{
/* blocking */
rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_plain_bufs, CL_TRUE, 0, num_cracked * sizeof (plain_t), cracked, 0, NULL, NULL);
if (rc == -1)
{
CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_plain_bufs, CL_TRUE, 0, num_cracked * sizeof (plain_t), cracked, 0, NULL, NULL);
hcfree (cracked);
if (CL_rc == -1) return -1;
return -1;
}
}
u32 cpt_cracked = 0;
u32 cpt_cracked = 0;
hc_thread_mutex_lock (status_ctx->mux_display);
hc_thread_mutex_lock (status_ctx->mux_display);
for (u32 i = 0; i < num_cracked; i++)
{
const u32 hash_pos = cracked[i].hash_pos;
for (u32 i = 0; i < num_cracked; i++)
{
const u32 hash_pos = cracked[i].hash_pos;
if (hashes->digests_shown[hash_pos] == 1) continue;
if (hashes->digests_shown[hash_pos] == 1) continue;
const u32 salt_pos = cracked[i].salt_pos;
salt_t *salt_buf = &hashes->salts_buf[salt_pos];
const u32 salt_pos = cracked[i].salt_pos;
salt_t *salt_buf = &hashes->salts_buf[salt_pos];
if ((hashconfig->opts_type & OPTS_TYPE_PT_NEVERCRACK) == 0)
{
hashes->digests_shown[hash_pos] = 1;
if ((hashconfig->opts_type & OPTS_TYPE_PT_NEVERCRACK) == 0)
{
hashes->digests_shown[hash_pos] = 1;
hashes->digests_done++;
hashes->digests_done++;
cpt_cracked++;
cpt_cracked++;
salt_buf->digests_done++;
salt_buf->digests_done++;
if (salt_buf->digests_done == salt_buf->digests_cnt)
{
hashes->salts_shown[salt_pos] = 1;
if (salt_buf->digests_done == salt_buf->digests_cnt)
{
hashes->salts_shown[salt_pos] = 1;
hashes->salts_done++;
}
hashes->salts_done++;
}
}
if (hashes->salts_done == hashes->salts_cnt) mycracked (hashcat_ctx);
if (hashes->salts_done == hashes->salts_cnt) mycracked (hashcat_ctx);
check_hash (hashcat_ctx, device_param, &cracked[i]);
rc = check_hash (hashcat_ctx, device_param, &cracked[i]);
if (hashconfig->opts_type & OPTS_TYPE_PT_NEVERCRACK)
{
// we need to reset cracked state on the device
// otherwise host thinks again and again the hash was cracked
// and returns invalid password each time
if (rc == -1)
{
break;
}
memset (hashes->digests_shown_tmp, 0, salt_buf->digests_cnt * sizeof (u32));
if (hashconfig->opts_type & OPTS_TYPE_PT_NEVERCRACK)
{
// we need to reset cracked state on the device
// otherwise host thinks again and again the hash was cracked
// and returns invalid password each time
if (device_param->is_cuda == true)
{
CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_digests_shown + (salt_buf->digests_offset * sizeof (u32)), &hashes->digests_shown_tmp[salt_buf->digests_offset], salt_buf->digests_cnt * sizeof (u32));
if (device_param->is_cuda == true)
{
rc = run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_digests_shown + (salt_buf->digests_offset * sizeof (u32)), salt_buf->digests_cnt * sizeof (u32));
if (CU_rc == -1) return -1;
if (rc == -1)
{
break;
}
}
if (device_param->is_hip == true)
{
HIP_rc = hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_digests_shown + (salt_buf->digests_offset * sizeof (u32)), &hashes->digests_shown_tmp[salt_buf->digests_offset], salt_buf->digests_cnt * sizeof (u32));
if (device_param->is_hip == true)
{
rc = run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_digests_shown + (salt_buf->digests_offset * sizeof (u32)), salt_buf->digests_cnt * sizeof (u32));
if (HIP_rc == -1) return -1;
if (rc == -1)
{
break;
}
}
if (device_param->is_opencl == true)
{
CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_shown, CL_TRUE, salt_buf->digests_offset * sizeof (u32), salt_buf->digests_cnt * sizeof (u32), &hashes->digests_shown_tmp[salt_buf->digests_offset], 0, NULL, NULL);
if (device_param->is_opencl == true)
{
/* NOTE: run_opencl_kernel_bzero() does not handle buffer offset */
rc = run_opencl_kernel_memset32 (hashcat_ctx, device_param, device_param->opencl_d_digests_shown, salt_buf->digests_offset * sizeof (u32), 0, salt_buf->digests_cnt * sizeof (u32));
if (CL_rc == -1) return -1;
if (rc == -1)
{
break;
}
}
}
}
hc_thread_mutex_unlock (status_ctx->mux_display);
hcfree (cracked);
if (cpt_cracked > 0)
{
hc_thread_mutex_lock (status_ctx->mux_display);
hc_thread_mutex_unlock (status_ctx->mux_display);
cpt_ctx->cpt_buf[cpt_ctx->cpt_pos].timestamp = time (NULL);
cpt_ctx->cpt_buf[cpt_ctx->cpt_pos].cracked = cpt_cracked;
hcfree (cracked);
cpt_ctx->cpt_pos++;
if (rc == -1)
{
return -1;
}
cpt_ctx->cpt_total += cpt_cracked;
if (cpt_cracked > 0)
{
hc_thread_mutex_lock (status_ctx->mux_display);
if (cpt_ctx->cpt_pos == CPT_CACHE) cpt_ctx->cpt_pos = 0;
cpt_ctx->cpt_buf[cpt_ctx->cpt_pos].timestamp = time (NULL);
cpt_ctx->cpt_buf[cpt_ctx->cpt_pos].cracked = cpt_cracked;
hc_thread_mutex_unlock (status_ctx->mux_display);
}
cpt_ctx->cpt_pos++;
num_cracked = 0;
cpt_ctx->cpt_total += cpt_cracked;
if (device_param->is_cuda == true)
{
CU_rc = hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_result, &num_cracked, sizeof (u32));
if (cpt_ctx->cpt_pos == CPT_CACHE) cpt_ctx->cpt_pos = 0;
if (CU_rc == -1) return -1;
}
hc_thread_mutex_unlock (status_ctx->mux_display);
}
if (device_param->is_hip == true)
{
HIP_rc = hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_result, &num_cracked, sizeof (u32));
if (device_param->is_cuda == true)
{
if (run_cuda_kernel_bzero (hashcat_ctx, device_param, device_param->cuda_d_result, sizeof (u32)) == -1) return -1;
}
if (HIP_rc == -1) return -1;
}
if (device_param->is_hip == true)
{
if (run_hip_kernel_bzero (hashcat_ctx, device_param, device_param->hip_d_result, sizeof (u32)) == -1) return -1;
}
if (device_param->is_opencl == true)
{
CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), &num_cracked, 0, NULL, NULL);
if (device_param->is_opencl == true)
{
if (run_opencl_kernel_bzero (hashcat_ctx, device_param, device_param->opencl_d_result, sizeof (u32)) == -1) return -1;
if (CL_rc == -1) return -1;
}
if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
}
return 0;
@ -1569,7 +1674,6 @@ int hashes_init_stage2 (hashcat_ctx_t *hashcat_ctx)
u32 digests_done = 0;
u32 *digests_shown = (u32 *) hccalloc (digests_cnt, sizeof (u32));
u32 *digests_shown_tmp = (u32 *) hccalloc (digests_cnt, sizeof (u32));
u32 salts_cnt = 0;
u32 salts_done = 0;
@ -1706,7 +1810,6 @@ int hashes_init_stage2 (hashcat_ctx_t *hashcat_ctx)
hashes->digests_done = digests_done;
hashes->digests_buf = digests_buf_new;
hashes->digests_shown = digests_shown;
hashes->digests_shown_tmp = digests_shown_tmp;
hashes->salts_cnt = salts_cnt;
hashes->salts_done = salts_done;
@ -1977,7 +2080,7 @@ int hashes_init_selftest (hashcat_ctx_t *hashcat_ctx)
memcpy (hashconfig_st, hashconfig, sizeof (hashconfig_t));
hashconfig_st->separator = SEPARATOR;
hashconfig_st->separator = ':';
if (user_options->hex_salt)
{
@ -2193,7 +2296,6 @@ void hashes_destroy (hashcat_ctx_t *hashcat_ctx)
hcfree (hashes->digests_buf);
hcfree (hashes->digests_shown);
hcfree (hashes->digests_shown_tmp);
hcfree (hashes->salts_buf);
hcfree (hashes->salts_shown);

@ -10,16 +10,16 @@
#include "hlfmt.h"
#include "shared.h"
static const char *HLFMT_TEXT_HASHCAT = "native hashcat";
static const char *HLFMT_TEXT_PWDUMP = "pwdump";
static const char *HLFMT_TEXT_PASSWD = "passwd";
static const char *HLFMT_TEXT_SHADOW = "shadow";
static const char *HLFMT_TEXT_DCC = "DCC";
static const char *HLFMT_TEXT_DCC2 = "DCC 2";
static const char *HLFMT_TEXT_NETNTLM1 = "NetNTLMv1";
static const char *HLFMT_TEXT_NETNTLM2 = "NetNTLMv2";
static const char *HLFMT_TEXT_NSLDAP = "nsldap";
static const char *HLFMT_TEXT_NSLDAPS = "nsldaps";
static const char *const HLFMT_TEXT_HASHCAT = "native hashcat";
static const char *const HLFMT_TEXT_PWDUMP = "pwdump";
static const char *const HLFMT_TEXT_PASSWD = "passwd";
static const char *const HLFMT_TEXT_SHADOW = "shadow";
static const char *const HLFMT_TEXT_DCC = "DCC";
static const char *const HLFMT_TEXT_DCC2 = "DCC 2";
static const char *const HLFMT_TEXT_NETNTLM1 = "NetNTLMv1";
static const char *const HLFMT_TEXT_NETNTLM2 = "NetNTLMv2";
static const char *const HLFMT_TEXT_NSLDAP = "nsldap";
static const char *const HLFMT_TEXT_NSLDAPS = "nsldaps";
// hlfmt hashcat

@ -109,19 +109,7 @@ int hm_get_threshold_slowdown_with_devices_idx (hashcat_ctx_t *hashcat_ctx, cons
}
else if (hwmon_ctx->hm_device[backend_device_idx].od_version == 6)
{
int CurrentValue = 0;
int DefaultValue = 0;
if (hm_ADL_Overdrive6_TargetTemperatureData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &CurrentValue, &DefaultValue) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].threshold_slowdown_get_supported = false;
return -1;
}
// the return value has never been tested since hm_ADL_Overdrive6_TargetTemperatureData_Get() never worked on any system. expect problems.
return DefaultValue;
}
}
}
@ -346,6 +334,22 @@ int hm_get_temperature_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int b
return Temperature / 1000;
}
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 8)
{
ADLPMLogDataOutput odlpDataOutput;
memset (&odlpDataOutput, 0, sizeof (ADLPMLogDataOutput));
if (hm_ADL2_New_QueryPMLogData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &odlpDataOutput) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].temperature_get_supported = false;
return -1;
}
return odlpDataOutput.sensors[PMLOG_TEMPERATURE_EDGE].value;
}
}
if (hwmon_ctx->hm_sysfs_amdgpu)
@ -431,8 +435,37 @@ int hm_get_fanpolicy_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int bac
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 6)
{
ADLOD6FanSpeedInfo lpFanSpeedInfo;
memset (&lpFanSpeedInfo, 0, sizeof (lpFanSpeedInfo));
if (hm_ADL_Overdrive6_FanSpeed_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &lpFanSpeedInfo) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].fanpolicy_get_supported = false;
hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
return -1;
}
return 1;
}
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 8)
{
ADLPMLogDataOutput odlpDataOutput;
memset (&odlpDataOutput, 0, sizeof (ADLPMLogDataOutput));
if (hm_ADL2_New_QueryPMLogData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &odlpDataOutput) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].fanpolicy_get_supported = false;
hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
return -1;
}
return odlpDataOutput.sensors[PMLOG_FAN_PERCENTAGE].supported;
}
}
if (hwmon_ctx->hm_sysfs_amdgpu)
@ -542,6 +575,22 @@ int hm_get_fanspeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int back
return faninfo.iFanSpeedPercent;
}
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 8)
{
ADLPMLogDataOutput odlpDataOutput;
memset (&odlpDataOutput, 0, sizeof (ADLPMLogDataOutput));
if (hm_ADL2_New_QueryPMLogData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &odlpDataOutput) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].fanspeed_get_supported = false;
return -1;
}
return odlpDataOutput.sensors[PMLOG_FAN_PERCENTAGE].value;
}
}
if (hwmon_ctx->hm_sysfs_amdgpu)
@ -617,18 +666,37 @@ int hm_get_buslanes_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int back
{
if (hwmon_ctx->hm_adl)
{
ADLPMActivity PMActivity;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
{
ADLPMActivity PMActivity;
PMActivity.iSize = sizeof (ADLPMActivity);
PMActivity.iSize = sizeof (ADLPMActivity);
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
return -1;
return -1;
}
return PMActivity.iCurrentBusLanes;
}
return PMActivity.iCurrentBusLanes;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 8)
{
ADLPMLogDataOutput odlpDataOutput;
memset (&odlpDataOutput, 0, sizeof (ADLPMLogDataOutput));
if (hm_ADL2_New_QueryPMLogData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &odlpDataOutput) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].buslanes_get_supported = false;
return -1;
}
return odlpDataOutput.sensors[PMLOG_BUS_LANES].value;
}
}
if (hwmon_ctx->hm_sysfs_amdgpu)
@ -704,18 +772,37 @@ int hm_get_utilization_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int b
{
if (hwmon_ctx->hm_adl)
{
ADLPMActivity PMActivity;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
{
ADLPMActivity PMActivity;
PMActivity.iSize = sizeof (ADLPMActivity);
PMActivity.iSize = sizeof (ADLPMActivity);
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported = false;
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported = false;
return -1;
return -1;
}
return PMActivity.iActivityPercent;
}
return PMActivity.iActivityPercent;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 8)
{
ADLPMLogDataOutput odlpDataOutput;
memset (&odlpDataOutput, 0, sizeof (ADLPMLogDataOutput));
if (hm_ADL2_New_QueryPMLogData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &odlpDataOutput) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].utilization_get_supported = false;
return -1;
}
return odlpDataOutput.sensors[PMLOG_INFO_ACTIVITY_GFX].value;
}
}
if (hwmon_ctx->hm_sysfs_amdgpu)
@ -808,18 +895,37 @@ int hm_get_memoryspeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int b
{
if (hwmon_ctx->hm_adl)
{
ADLPMActivity PMActivity;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
{
ADLPMActivity PMActivity;
PMActivity.iSize = sizeof (ADLPMActivity);
PMActivity.iSize = sizeof (ADLPMActivity);
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
return -1;
return -1;
}
return PMActivity.iMemoryClock / 100;
}
return PMActivity.iMemoryClock / 100;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 8)
{
ADLPMLogDataOutput odlpDataOutput;
memset (&odlpDataOutput, 0, sizeof (ADLPMLogDataOutput));
if (hm_ADL2_New_QueryPMLogData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &odlpDataOutput) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].memoryspeed_get_supported = false;
return -1;
}
return odlpDataOutput.sensors[PMLOG_CLK_MEMCLK].value;
}
}
if (hwmon_ctx->hm_sysfs_amdgpu)
@ -895,18 +1001,37 @@ int hm_get_corespeed_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int bac
{
if (hwmon_ctx->hm_adl)
{
ADLPMActivity PMActivity;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 5)
{
ADLPMActivity PMActivity;
PMActivity.iSize = sizeof (ADLPMActivity);
PMActivity.iSize = sizeof (ADLPMActivity);
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
if (hm_ADL_Overdrive_CurrentActivity_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &PMActivity) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
return -1;
return -1;
}
return PMActivity.iEngineClock / 100;
}
return PMActivity.iEngineClock / 100;
if (hwmon_ctx->hm_device[backend_device_idx].od_version == 8)
{
ADLPMLogDataOutput odlpDataOutput;
memset (&odlpDataOutput, 0, sizeof (ADLPMLogDataOutput));
if (hm_ADL2_New_QueryPMLogData_Get (hashcat_ctx, hwmon_ctx->hm_device[backend_device_idx].adl, &odlpDataOutput) == -1)
{
hwmon_ctx->hm_device[backend_device_idx].corespeed_get_supported = false;
return -1;
}
return odlpDataOutput.sensors[PMLOG_CLK_GFXCLK].value;
}
}
if (hwmon_ctx->hm_sysfs_amdgpu)
@ -1400,7 +1525,9 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
int od_enabled = 0;
int od_version = 0;
hm_ADL_Overdrive_Caps (hashcat_ctx, lpAdapterInfo[i].iAdapterIndex, &od_supported, &od_enabled, &od_version);
hm_ADL2_Overdrive_Caps (hashcat_ctx, lpAdapterInfo[i].iAdapterIndex, &od_supported, &od_enabled, &od_version);
if (od_version < 8) od_version = 5;
hm_adapters_adl[device_id].od_version = od_version;
@ -1534,12 +1661,6 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hwmon_ctx->enabled = true;
/**
* save buffer required for later restores
*/
hwmon_ctx->od_clock_mem_status = (ADLOD6MemClockState *) hccalloc (backend_ctx->backend_devices_cnt, sizeof (ADLOD6MemClockState));
/**
* HM devices: copy
*/
@ -1795,8 +1916,6 @@ void hwmon_ctx_destroy (hashcat_ctx_t *hashcat_ctx)
// free memory
hcfree (hwmon_ctx->od_clock_mem_status);
hcfree (hwmon_ctx->hm_device);
memset (hwmon_ctx, 0, sizeof (hwmon_ctx_t));

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save