diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 8d20c2040..3e3fe573c 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -7,7 +7,7 @@ * pure scalar functions */ -inline int ffz (const u32 v) +int ffz (const u32 v) { #ifdef _unroll #pragma unroll @@ -22,7 +22,7 @@ inline int ffz (const u32 v) return -1; } -inline int hash_comp (const u32 d1[4], __global const u32 *d2) +int hash_comp (const u32 d1[4], __global const u32 *d2) { if (d1[3] > d2[DGST_R3]) return ( 1); if (d1[3] < d2[DGST_R3]) return (-1); @@ -36,7 +36,7 @@ inline int hash_comp (const u32 d1[4], __global const u32 *d2) return (0); } -inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global const digest_t *digests_buf) +int find_hash (const u32 digest[4], const u32 digests_cnt, __global const digest_t *digests_buf) { for (u32 l = 0, r = digests_cnt; r; r >>= 1) { @@ -59,12 +59,12 @@ inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global const return (-1); } -inline u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest) +u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest) { return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f))); } -inline u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2) +u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 
bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2) { if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0); if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0); @@ -79,7 +79,7 @@ inline u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global return (1); } -inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u32 gid, const u32 il_pos) +void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u32 gid, const u32 il_pos) { const u32 idx = atomic_inc (d_result); @@ -100,7 +100,7 @@ inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, con plains_buf[idx].il_pos = il_pos; } -inline int count_char (const u32 *buf, const int elems, const u32 c) +int count_char (const u32 *buf, const int elems, const u32 c) { int r = 0; @@ -117,7 +117,7 @@ inline int count_char (const u32 *buf, const int elems, const u32 c) return r; } -inline float get_entropy (const u32 *buf, const int elems) +float get_entropy (const u32 *buf, const int elems) { const int length = elems * 4; @@ -144,7 +144,7 @@ inline float get_entropy (const u32 *buf, const int elems) * vector functions */ -inline void truncate_block_4x4_le (u32x w0[4], const u32 len) +void truncate_block_4x4_le (u32x w0[4], const u32 len) { switch (len) { @@ -254,7 +254,7 @@ inline void truncate_block_4x4_le (u32x w0[4], const u32 len) } } -inline void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) { switch (len) { @@ -1060,7 +1060,7 @@ inline void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[ } } -inline void truncate_block_4x4_be (u32x w0[4], 
const u32 len) +void truncate_block_4x4_be (u32x w0[4], const u32 len) { switch (len) { @@ -1170,7 +1170,7 @@ inline void truncate_block_4x4_be (u32x w0[4], const u32 len) } } -inline void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) { switch (len) { @@ -1976,7 +1976,7 @@ inline void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[ } } -inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x3727); @@ -2001,7 +2001,7 @@ inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x1707); @@ -2026,7 +2026,7 @@ inline void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x7372); @@ -2051,7 +2051,7 @@ inline void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) +void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) { #ifdef IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x4602); @@ -2072,7 +2072,7 @@ inline void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) #endif } -inline void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) +void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) { #ifdef IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x6420); @@ -2093,7 +2093,7 @@ inline void undo_utf16le (const u32x in1[4], const 
u32x in2[4], u32x out[4]) #endif } -inline void append_0x80_1x4 (u32x w0[4], const u32 offset) +void append_0x80_1x4 (u32x w0[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2103,7 +2103,7 @@ inline void append_0x80_1x4 (u32x w0[4], const u32 offset) w0[3] |= (offset >= 12) ? tmp : 0; } -inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) +void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2117,7 +2117,7 @@ inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) w1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) +void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2135,7 +2135,7 @@ inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offse w2[3] |= (offset >= 44) ? tmp : 0; } -inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2157,7 +2157,7 @@ inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], con w3[3] |= (offset >= 60) ? 
tmp : 0; } -inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { switch (offset) { @@ -2675,7 +2675,7 @@ inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32 } } -inline void append_0x80_1x16 (u32x w[16], const u32 offset) +void append_0x80_1x16 (u32x w[16], const u32 offset) { switch (offset) { @@ -2937,251 +2937,163 @@ inline void append_0x80_1x16 (u32x w[16], const u32 offset) } } -inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); switch (offset / 4) { case 0: - w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], 
offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] 
= amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 2: - w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign 
(w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 3: - w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = 
amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 4: - w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], 
offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 5: - w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 6: - w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], 
offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -3189,32 +3101,18 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 7: - w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = 
amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -3223,30 +3121,17 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 8: - w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -3256,28 +3141,16 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 9: - w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + 
w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -3288,26 +3161,15 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 10: - w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -3319,24 +3181,14 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 11: - w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], 
offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -3349,22 +3201,13 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 12: - w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -3378,20 +3221,12 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 13: - w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -3406,18 +3241,11 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 14: - w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -3433,16 +3261,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 
0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 15: - w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -3459,18 +3281,28 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - break; } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -3798,12 +3630,525 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif } -inline void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) +void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + switch 
(offset / 4) + { + case 0: + c0[0] = amd_bytealign (w3[3], 0, offset); + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = amd_bytealign (w3[3], 0, offset); + c0[0] = amd_bytealign (w3[2], w3[3], offset); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = amd_bytealign (w3[3], 0, offset); + c0[1] = amd_bytealign (w3[2], w3[3], offset); + c0[0] = amd_bytealign (w3[1], w3[2], offset); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = 
amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = amd_bytealign (w3[3], 0, offset); + c0[2] = amd_bytealign (w3[2], w3[3], offset); + c0[1] = amd_bytealign (w3[1], w3[2], offset); + c0[0] = amd_bytealign (w3[0], w3[1], offset); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = amd_bytealign (w3[3], 0, offset); + c0[3] = amd_bytealign (w3[2], w3[3], offset); + c0[2] = amd_bytealign (w3[1], w3[2], offset); + c0[1] = amd_bytealign (w3[0], w3[1], offset); + c0[0] = amd_bytealign (w2[3], w3[0], offset); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], 
offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = amd_bytealign (w3[3], 0, offset); + c1[0] = amd_bytealign (w3[2], w3[3], offset); + c0[3] = amd_bytealign (w3[1], w3[2], offset); + c0[2] = amd_bytealign (w3[0], w3[1], offset); + c0[1] = amd_bytealign (w2[3], w3[0], offset); + c0[0] = amd_bytealign (w2[2], w2[3], offset); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = amd_bytealign (w3[3], 0, offset); + c1[1] = amd_bytealign (w3[2], w3[3], offset); + c1[0] = amd_bytealign (w3[1], w3[2], offset); + c0[3] = amd_bytealign (w3[0], w3[1], offset); + c0[2] = amd_bytealign (w2[3], w3[0], offset); + c0[1] = amd_bytealign (w2[2], w2[3], offset); + c0[0] = amd_bytealign (w2[1], w2[2], offset); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = 
amd_bytealign ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = amd_bytealign (w3[3], 0, offset); + c1[2] = amd_bytealign (w3[2], w3[3], offset); + c1[1] = amd_bytealign (w3[1], w3[2], offset); + c1[0] = amd_bytealign (w3[0], w3[1], offset); + c0[3] = amd_bytealign (w2[3], w3[0], offset); + c0[2] = amd_bytealign (w2[2], w2[3], offset); + c0[1] = amd_bytealign (w2[1], w2[2], offset); + c0[0] = amd_bytealign (w2[0], w2[1], offset); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = amd_bytealign (w3[3], 0, offset); + c1[3] = amd_bytealign (w3[2], w3[3], offset); + c1[2] = amd_bytealign (w3[1], w3[2], offset); + c1[1] = amd_bytealign (w3[0], w3[1], offset); + c1[0] = amd_bytealign (w2[3], w3[0], offset); + c0[3] = amd_bytealign (w2[2], w2[3], offset); + c0[2] = amd_bytealign (w2[1], w2[2], offset); + c0[1] = amd_bytealign (w2[0], w2[1], offset); + c0[0] = amd_bytealign (w1[3], w2[0], offset); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + 
c2[1] = amd_bytealign (w3[3], 0, offset); + c2[0] = amd_bytealign (w3[2], w3[3], offset); + c1[3] = amd_bytealign (w3[1], w3[2], offset); + c1[2] = amd_bytealign (w3[0], w3[1], offset); + c1[1] = amd_bytealign (w2[3], w3[0], offset); + c1[0] = amd_bytealign (w2[2], w2[3], offset); + c0[3] = amd_bytealign (w2[1], w2[2], offset); + c0[2] = amd_bytealign (w2[0], w2[1], offset); + c0[1] = amd_bytealign (w1[3], w2[0], offset); + c0[0] = amd_bytealign (w1[2], w1[3], offset); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = amd_bytealign (w3[3], 0, offset); + c2[1] = amd_bytealign (w3[2], w3[3], offset); + c2[0] = amd_bytealign (w3[1], w3[2], offset); + c1[3] = amd_bytealign (w3[0], w3[1], offset); + c1[2] = amd_bytealign (w2[3], w3[0], offset); + c1[1] = amd_bytealign (w2[2], w2[3], offset); + c1[0] = amd_bytealign (w2[1], w2[2], offset); + c0[3] = amd_bytealign (w2[0], w2[1], offset); + c0[2] = amd_bytealign (w1[3], w2[0], offset); + c0[1] = amd_bytealign (w1[2], w1[3], offset); + c0[0] = amd_bytealign (w1[1], w1[2], offset); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = amd_bytealign (w3[3], 0, offset); + c2[2] = amd_bytealign (w3[2], w3[3], 
offset); + c2[1] = amd_bytealign (w3[1], w3[2], offset); + c2[0] = amd_bytealign (w3[0], w3[1], offset); + c1[3] = amd_bytealign (w2[3], w3[0], offset); + c1[2] = amd_bytealign (w2[2], w2[3], offset); + c1[1] = amd_bytealign (w2[1], w2[2], offset); + c1[0] = amd_bytealign (w2[0], w2[1], offset); + c0[3] = amd_bytealign (w1[3], w2[0], offset); + c0[2] = amd_bytealign (w1[2], w1[3], offset); + c0[1] = amd_bytealign (w1[1], w1[2], offset); + c0[0] = amd_bytealign (w1[0], w1[1], offset); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = amd_bytealign (w3[3], 0, offset); + c2[3] = amd_bytealign (w3[2], w3[3], offset); + c2[2] = amd_bytealign (w3[1], w3[2], offset); + c2[1] = amd_bytealign (w3[0], w3[1], offset); + c2[0] = amd_bytealign (w2[3], w3[0], offset); + c1[3] = amd_bytealign (w2[2], w2[3], offset); + c1[2] = amd_bytealign (w2[1], w2[2], offset); + c1[1] = amd_bytealign (w2[0], w2[1], offset); + c1[0] = amd_bytealign (w1[3], w2[0], offset); + c0[3] = amd_bytealign (w1[2], w1[3], offset); + c0[2] = amd_bytealign (w1[1], w1[2], offset); + c0[1] = amd_bytealign (w1[0], w1[1], offset); + c0[0] = amd_bytealign (w0[3], w1[0], offset); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = amd_bytealign (w3[3], 0, offset); + c3[0] = amd_bytealign (w3[2], w3[3], offset); + c2[3] = amd_bytealign 
(w3[1], w3[2], offset); + c2[2] = amd_bytealign (w3[0], w3[1], offset); + c2[1] = amd_bytealign (w2[3], w3[0], offset); + c2[0] = amd_bytealign (w2[2], w2[3], offset); + c1[3] = amd_bytealign (w2[1], w2[2], offset); + c1[2] = amd_bytealign (w2[0], w2[1], offset); + c1[1] = amd_bytealign (w1[3], w2[0], offset); + c1[0] = amd_bytealign (w1[2], w1[3], offset); + c0[3] = amd_bytealign (w1[1], w1[2], offset); + c0[2] = amd_bytealign (w1[0], w1[1], offset); + c0[1] = amd_bytealign (w0[3], w1[0], offset); + c0[0] = amd_bytealign (w0[2], w0[3], offset); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = amd_bytealign (w3[3], 0, offset); + c3[1] = amd_bytealign (w3[2], w3[3], offset); + c3[0] = amd_bytealign (w3[1], w3[2], offset); + c2[3] = amd_bytealign (w3[0], w3[1], offset); + c2[2] = amd_bytealign (w2[3], w3[0], offset); + c2[1] = amd_bytealign (w2[2], w2[3], offset); + c2[0] = amd_bytealign (w2[1], w2[2], offset); + c1[3] = amd_bytealign (w2[0], w2[1], offset); + c1[2] = amd_bytealign (w1[3], w2[0], offset); + c1[1] = amd_bytealign (w1[2], w1[3], offset); + c1[0] = amd_bytealign (w1[1], w1[2], offset); + c0[3] = amd_bytealign (w1[0], w1[1], offset); + c0[2] = amd_bytealign (w0[3], w1[0], offset); + c0[1] = amd_bytealign (w0[2], w0[3], offset); + c0[0] = amd_bytealign (w0[1], w0[2], offset); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = amd_bytealign (w3[3], 0, offset); + c3[2] = amd_bytealign (w3[2], w3[3], offset); + 
c3[1] = amd_bytealign (w3[1], w3[2], offset); + c3[0] = amd_bytealign (w3[0], w3[1], offset); + c2[3] = amd_bytealign (w2[3], w3[0], offset); + c2[2] = amd_bytealign (w2[2], w2[3], offset); + c2[1] = amd_bytealign (w2[1], w2[2], offset); + c2[0] = amd_bytealign (w2[0], w2[1], offset); + c1[3] = amd_bytealign (w1[3], w2[0], offset); + c1[2] = amd_bytealign (w1[2], w1[3], offset); + c1[1] = amd_bytealign (w1[1], w1[2], offset); + c1[0] = amd_bytealign (w1[0], w1[1], offset); + c0[3] = amd_bytealign (w0[3], w1[0], offset); + c0[2] = amd_bytealign (w0[2], w0[3], offset); + c0[1] = amd_bytealign (w0[1], w0[2], offset); + c0[0] = amd_bytealign (w0[0], w0[1], offset); + w3[3] = amd_bytealign ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + c0[0] = swap32 (c0[0]); + c0[1] = swap32 (c0[1]); + c0[2] = swap32 (c0[2]); + c0[3] = swap32 (c0[3]); + c1[0] = swap32 (c1[0]); + c1[1] = swap32 (c1[1]); + c1[2] = swap32 (c1[2]); + c1[3] = swap32 (c1[3]); + c2[0] = swap32 (c2[0]); + c2[1] = swap32 (c2[1]); + c2[2] = swap32 (c2[2]); + c2[3] = swap32 (c2[3]); + c3[0] = swap32 (c3[0]); + c3[1] = swap32 (c3[1]); + c3[2] = swap32 (c3[2]); + c3[3] = swap32 (c3[3]); + #endif + + #ifdef IS_NV + // todo switch (offset / 4) { case 0: @@ -4598,9 +4943,10 @@ inline void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4] break; } + #endif } -inline void switch_buffer_by_offset_be (u32x 
w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -5255,7 +5601,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif } -inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) +void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -6182,459 +6528,291 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] #endif } -inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 
(w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); switch (offset / 4) { - case 0: - w7[3] = amd_bytealign (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, 
offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + case 0: + w7[3] = amd_bytealign (w7[2], w7[3], offset); + w7[2] = amd_bytealign (w7[1], w7[2], offset); + w7[1] = amd_bytealign (w7[0], w7[1], offset); + w7[0] = amd_bytealign (w6[3], w7[0], offset); + w6[3] = amd_bytealign (w6[2], w6[3], offset); + w6[2] = amd_bytealign (w6[1], w6[2], offset); + w6[1] = amd_bytealign (w6[0], w6[1], offset); + w6[0] = amd_bytealign (w5[3], w6[0], offset); + w5[3] = amd_bytealign (w5[2], w5[3], offset); + w5[2] = amd_bytealign (w5[1], w5[2], offset); + w5[1] = amd_bytealign (w5[0], w5[1], offset); + w5[0] = amd_bytealign (w4[3], w5[0], offset); + w4[3] = amd_bytealign (w4[2], w4[3], offset); + w4[2] = amd_bytealign (w4[1], w4[2], offset); + w4[1] = amd_bytealign (w4[0], w4[1], offset); + w4[0] = amd_bytealign (w3[3], w4[0], offset); + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + 
w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; - case 1: - w7[3] = amd_bytealign (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 1: + w7[3] = amd_bytealign (w7[1], w7[2], offset); + w7[2] = amd_bytealign (w7[0], w7[1], offset); + 
w7[1] = amd_bytealign (w6[3], w7[0], offset); + w7[0] = amd_bytealign (w6[2], w6[3], offset); + w6[3] = amd_bytealign (w6[1], w6[2], offset); + w6[2] = amd_bytealign (w6[0], w6[1], offset); + w6[1] = amd_bytealign (w5[3], w6[0], offset); + w6[0] = amd_bytealign (w5[2], w5[3], offset); + w5[3] = amd_bytealign (w5[1], w5[2], offset); + w5[2] = amd_bytealign (w5[0], w5[1], offset); + w5[1] = amd_bytealign (w4[3], w5[0], offset); + w5[0] = amd_bytealign (w4[2], w4[3], offset); + w4[3] = amd_bytealign (w4[1], w4[2], offset); + w4[2] = amd_bytealign (w4[0], w4[1], offset); + w4[1] = amd_bytealign (w3[3], w4[0], offset); + w4[0] = amd_bytealign (w3[2], w3[3], offset); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 2: - w7[3] = amd_bytealign 
(w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + case 2: + w7[3] = amd_bytealign (w7[0], w7[1], offset); + w7[2] = amd_bytealign (w6[3], w7[0], offset); + w7[1] = amd_bytealign (w6[2], w6[3], offset); + w7[0] = amd_bytealign (w6[1], w6[2], offset); + w6[3] = amd_bytealign (w6[0], w6[1], offset); + w6[2] = amd_bytealign (w5[3], w6[0], offset); + w6[1] = amd_bytealign (w5[2], w5[3], offset); + 
w6[0] = amd_bytealign (w5[1], w5[2], offset); + w5[3] = amd_bytealign (w5[0], w5[1], offset); + w5[2] = amd_bytealign (w4[3], w5[0], offset); + w5[1] = amd_bytealign (w4[2], w4[3], offset); + w5[0] = amd_bytealign (w4[1], w4[2], offset); + w4[3] = amd_bytealign (w4[0], w4[1], offset); + w4[2] = amd_bytealign (w3[3], w4[0], offset); + w4[1] = amd_bytealign (w3[2], w3[3], offset); + w4[0] = amd_bytealign (w3[1], w3[2], offset); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 3: - w7[3] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign (w5[3], w5[2], 
offset_minus_4); - w6[1] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + case 3: + w7[3] = amd_bytealign (w6[3], w7[0], offset); + w7[2] = amd_bytealign (w6[2], w6[3], offset); + w7[1] = amd_bytealign (w6[1], w6[2], offset); + w7[0] = amd_bytealign (w6[0], w6[1], offset); + w6[3] = amd_bytealign (w5[3], w6[0], offset); + w6[2] = amd_bytealign (w5[2], w5[3], offset); + w6[1] = amd_bytealign (w5[1], w5[2], offset); + w6[0] = amd_bytealign (w5[0], w5[1], offset); + w5[3] = amd_bytealign (w4[3], w5[0], offset); + w5[2] = amd_bytealign (w4[2], w4[3], offset); + w5[1] = amd_bytealign (w4[1], w4[2], offset); + w5[0] = amd_bytealign (w4[0], w4[1], offset); + w4[3] = amd_bytealign (w3[3], w4[0], offset); + w4[2] = amd_bytealign (w3[2], w3[3], offset); + w4[1] = 
amd_bytealign (w3[1], w3[2], offset); + w4[0] = amd_bytealign (w3[0], w3[1], offset); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 4: - w7[3] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign (w3[3], w3[2], 
offset_minus_4); - w4[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 4: + w7[3] = amd_bytealign (w6[2], w6[3], offset); + w7[2] = amd_bytealign (w6[1], w6[2], offset); + w7[1] = amd_bytealign (w6[0], w6[1], offset); + w7[0] = amd_bytealign (w5[3], w6[0], offset); + w6[3] = amd_bytealign (w5[2], w5[3], offset); + w6[2] = amd_bytealign (w5[1], w5[2], offset); + w6[1] = amd_bytealign (w5[0], w5[1], offset); + w6[0] = amd_bytealign (w4[3], w5[0], offset); + w5[3] = amd_bytealign (w4[2], w4[3], offset); + w5[2] = amd_bytealign (w4[1], w4[2], offset); + w5[1] = amd_bytealign (w4[0], w4[1], offset); + w5[0] = amd_bytealign (w3[3], w4[0], offset); + w4[3] = amd_bytealign (w3[2], w3[3], offset); + w4[2] = amd_bytealign (w3[1], w3[2], offset); + w4[1] = amd_bytealign (w3[0], w3[1], offset); + w4[0] = amd_bytealign (w2[3], w3[0], offset); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], 
w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 5: - w7[3] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], 
offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 5: + w7[3] = amd_bytealign (w6[1], w6[2], offset); + w7[2] = amd_bytealign (w6[0], w6[1], offset); + w7[1] = amd_bytealign (w5[3], w6[0], offset); + w7[0] = amd_bytealign (w5[2], w5[3], offset); + w6[3] = amd_bytealign (w5[1], w5[2], offset); + w6[2] = amd_bytealign (w5[0], w5[1], offset); + w6[1] = amd_bytealign (w4[3], w5[0], offset); + w6[0] = amd_bytealign (w4[2], w4[3], offset); + w5[3] = amd_bytealign (w4[1], w4[2], offset); + w5[2] = amd_bytealign (w4[0], w4[1], offset); + w5[1] = amd_bytealign (w3[3], w4[0], offset); + w5[0] = amd_bytealign (w3[2], w3[3], offset); + w4[3] = amd_bytealign (w3[1], w3[2], offset); + w4[2] = amd_bytealign (w3[0], w3[1], offset); + w4[1] = amd_bytealign (w2[3], w3[0], offset); + w4[0] = amd_bytealign (w2[2], w2[3], offset); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = 
w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 6: - w7[3] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + case 6: + w7[3] = amd_bytealign (w6[0], w6[1], offset); + w7[2] = amd_bytealign (w5[3], w6[0], offset); + w7[1] = amd_bytealign (w5[2], w5[3], offset); + w7[0] = amd_bytealign (w5[1], w5[2], offset); + w6[3] = amd_bytealign (w5[0], w5[1], offset); + w6[2] = amd_bytealign (w4[3], w5[0], offset); + w6[1] = amd_bytealign (w4[2], w4[3], offset); + w6[0] = amd_bytealign (w4[1], w4[2], 
offset); + w5[3] = amd_bytealign (w4[0], w4[1], offset); + w5[2] = amd_bytealign (w3[3], w4[0], offset); + w5[1] = amd_bytealign (w3[2], w3[3], offset); + w5[0] = amd_bytealign (w3[1], w3[2], offset); + w4[3] = amd_bytealign (w3[0], w3[1], offset); + w4[2] = amd_bytealign (w2[3], w3[0], offset); + w4[1] = amd_bytealign (w2[2], w2[3], offset); + w4[0] = amd_bytealign (w2[1], w2[2], offset); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -6642,64 +6820,34 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 7: - w7[3] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign (w4[0], 
w3[3], offset_minus_4); - w5[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + case 7: + w7[3] = amd_bytealign (w5[3], w6[0], offset); + w7[2] = amd_bytealign (w5[2], w5[3], offset); + w7[1] = amd_bytealign (w5[1], w5[2], offset); + w7[0] = amd_bytealign (w5[0], w5[1], offset); + w6[3] = amd_bytealign (w4[3], w5[0], offset); + w6[2] = amd_bytealign (w4[2], w4[3], offset); + w6[1] = amd_bytealign (w4[1], w4[2], offset); + w6[0] = amd_bytealign (w4[0], w4[1], offset); + w5[3] = amd_bytealign (w3[3], w4[0], offset); + w5[2] = amd_bytealign (w3[2], w3[3], offset); + w5[1] = amd_bytealign (w3[1], w3[2], offset); + w5[0] = amd_bytealign (w3[0], w3[1], offset); + w4[3] = amd_bytealign (w2[3], w3[0], offset); + w4[2] = amd_bytealign (w2[2], w2[3], offset); + w4[1] = amd_bytealign (w2[1], w2[2], offset); + w4[0] = amd_bytealign (w2[0], w2[1], offset); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = 
amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -6708,62 +6856,33 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 8: - w7[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - 
w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 8: + w7[3] = amd_bytealign (w5[2], w5[3], offset); + w7[2] = amd_bytealign (w5[1], w5[2], offset); + w7[1] = amd_bytealign (w5[0], w5[1], offset); + w7[0] = amd_bytealign (w4[3], w5[0], offset); + w6[3] = amd_bytealign (w4[2], w4[3], offset); + w6[2] = amd_bytealign (w4[1], w4[2], offset); + w6[1] = amd_bytealign (w4[0], w4[1], offset); + w6[0] = amd_bytealign (w3[3], w4[0], offset); + w5[3] = amd_bytealign (w3[2], w3[3], offset); + w5[2] = amd_bytealign (w3[1], w3[2], offset); + w5[1] = amd_bytealign (w3[0], w3[1], offset); + w5[0] = amd_bytealign (w2[3], w3[0], offset); + w4[3] = amd_bytealign (w2[2], w2[3], offset); + w4[2] = amd_bytealign (w2[1], w2[2], offset); + w4[1] = amd_bytealign (w2[0], w2[1], offset); + w4[0] = amd_bytealign (w1[3], w2[0], offset); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -6773,60 +6892,32 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 9: - w7[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[1] 
= amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 9: + w7[3] = amd_bytealign (w5[1], w5[2], offset); + w7[2] = amd_bytealign (w5[0], w5[1], offset); + w7[1] = amd_bytealign (w4[3], w5[0], offset); + w7[0] = amd_bytealign (w4[2], w4[3], offset); + w6[3] = amd_bytealign (w4[1], w4[2], offset); + w6[2] = amd_bytealign (w4[0], w4[1], offset); + w6[1] = amd_bytealign (w3[3], w4[0], offset); + w6[0] = amd_bytealign (w3[2], w3[3], offset); + w5[3] = amd_bytealign (w3[1], w3[2], offset); + w5[2] = amd_bytealign (w3[0], w3[1], offset); + w5[1] = amd_bytealign (w2[3], w3[0], offset); + w5[0] = amd_bytealign (w2[2], w2[3], offset); + w4[3] = amd_bytealign (w2[1], w2[2], offset); + w4[2] = amd_bytealign (w2[0], w2[1], offset); + w4[1] = amd_bytealign (w1[3], w2[0], offset); + w4[0] = amd_bytealign (w1[2], w1[3], offset); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = 
amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -6837,58 +6928,31 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 10: - w7[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] 
= amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w5[0], w5[1], offset); + w7[2] = amd_bytealign (w4[3], w5[0], offset); + w7[1] = amd_bytealign (w4[2], w4[3], offset); + w7[0] = amd_bytealign (w4[1], w4[2], offset); + w6[3] = amd_bytealign (w4[0], w4[1], offset); + w6[2] = amd_bytealign (w3[3], w4[0], offset); + w6[1] = amd_bytealign (w3[2], w3[3], offset); + w6[0] = amd_bytealign (w3[1], w3[2], offset); + w5[3] = amd_bytealign (w3[0], w3[1], offset); + w5[2] = amd_bytealign (w2[3], w3[0], offset); + w5[1] = amd_bytealign (w2[2], w2[3], offset); + w5[0] = amd_bytealign (w2[1], w2[2], offset); + w4[3] = amd_bytealign (w2[0], w2[1], offset); + w4[2] = amd_bytealign (w1[3], w2[0], offset); + w4[1] = amd_bytealign (w1[2], w1[3], offset); + w4[0] = amd_bytealign (w1[1], w1[2], offset); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -6900,56 +6964,30 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 11: - w7[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign (w4[0], w3[3], 
offset_minus_4); - w6[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[3], w5[0], offset); + w7[2] = amd_bytealign (w4[2], w4[3], offset); + w7[1] = amd_bytealign (w4[1], w4[2], offset); + w7[0] = amd_bytealign (w4[0], w4[1], offset); + w6[3] = amd_bytealign (w3[3], w4[0], offset); + w6[2] = amd_bytealign (w3[2], w3[3], offset); + w6[1] = amd_bytealign (w3[1], w3[2], offset); + w6[0] = amd_bytealign (w3[0], w3[1], offset); + w5[3] = amd_bytealign (w2[3], w3[0], offset); + w5[2] = amd_bytealign (w2[2], w2[3], offset); + w5[1] = amd_bytealign (w2[1], w2[2], offset); + w5[0] = amd_bytealign (w2[0], w2[1], offset); + w4[3] = amd_bytealign (w1[3], w2[0], offset); + w4[2] = amd_bytealign (w1[2], w1[3], offset); + w4[1] = amd_bytealign (w1[1], w1[2], offset); + w4[0] = amd_bytealign (w1[0], w1[1], offset); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -6962,54 +7000,29 @@ inline void 
switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 12: - w7[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[2], w4[3], offset); + w7[2] = amd_bytealign (w4[1], w4[2], offset); + w7[1] = amd_bytealign (w4[0], w4[1], offset); + w7[0] = amd_bytealign (w3[3], w4[0], offset); + w6[3] = amd_bytealign (w3[2], w3[3], offset); + w6[2] = amd_bytealign (w3[1], w3[2], offset); + w6[1] = amd_bytealign (w3[0], w3[1], offset); + w6[0] = amd_bytealign (w2[3], w3[0], offset); + 
w5[3] = amd_bytealign (w2[2], w2[3], offset); + w5[2] = amd_bytealign (w2[1], w2[2], offset); + w5[1] = amd_bytealign (w2[0], w2[1], offset); + w5[0] = amd_bytealign (w1[3], w2[0], offset); + w4[3] = amd_bytealign (w1[2], w1[3], offset); + w4[2] = amd_bytealign (w1[1], w1[2], offset); + w4[1] = amd_bytealign (w1[0], w1[1], offset); + w4[0] = amd_bytealign (w0[3], w1[0], offset); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -7023,52 +7036,28 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 13: - w7[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign (w0[3], w0[2], 
offset_minus_4); - w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[1], w4[2], offset); + w7[2] = amd_bytealign (w4[0], w4[1], offset); + w7[1] = amd_bytealign (w3[3], w4[0], offset); + w7[0] = amd_bytealign (w3[2], w3[3], offset); + w6[3] = amd_bytealign (w3[1], w3[2], offset); + w6[2] = amd_bytealign (w3[0], w3[1], offset); + w6[1] = amd_bytealign (w2[3], w3[0], offset); + w6[0] = amd_bytealign (w2[2], w2[3], offset); + w5[3] = amd_bytealign (w2[1], w2[2], offset); + w5[2] = amd_bytealign (w2[0], w2[1], offset); + w5[1] = amd_bytealign (w1[3], w2[0], offset); + w5[0] = amd_bytealign (w1[2], w1[3], offset); + w4[3] = amd_bytealign (w1[1], w1[2], offset); + w4[2] = amd_bytealign (w1[0], w1[1], offset); + w4[1] = amd_bytealign (w0[3], w1[0], offset); + w4[0] = amd_bytealign (w0[2], w0[3], offset); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -7083,50 +7072,27 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 14: - w7[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign (w2[3], w2[2], 
offset_minus_4); - w6[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[0], w4[1], offset); + w7[2] = amd_bytealign (w3[3], w4[0], offset); + w7[1] = amd_bytealign (w3[2], w3[3], offset); + w7[0] = amd_bytealign (w3[1], w3[2], offset); + w6[3] = amd_bytealign (w3[0], w3[1], offset); + w6[2] = amd_bytealign (w2[3], w3[0], offset); + w6[1] = amd_bytealign (w2[2], w2[3], offset); + w6[0] = amd_bytealign (w2[1], w2[2], offset); + w5[3] = amd_bytealign (w2[0], w2[1], offset); + w5[2] = amd_bytealign (w1[3], w2[0], offset); + w5[1] = amd_bytealign (w1[2], w1[3], offset); + w5[0] = amd_bytealign (w1[1], w1[2], offset); + w4[3] = amd_bytealign (w1[0], w1[1], offset); + w4[2] = amd_bytealign (w0[3], w1[0], offset); + w4[1] = amd_bytealign (w0[2], w0[3], offset); + w4[0] = amd_bytealign (w0[1], w0[2], offset); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -7142,48 +7108,26 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 
15: - w7[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w3[3], w4[0], offset); + w7[2] = amd_bytealign (w3[2], w3[3], offset); + w7[1] = amd_bytealign (w3[1], w3[2], offset); + w7[0] = amd_bytealign (w3[0], w3[1], offset); + w6[3] = amd_bytealign (w2[3], w3[0], offset); + w6[2] = amd_bytealign (w2[2], w2[3], offset); + w6[1] = amd_bytealign (w2[1], w2[2], offset); + w6[0] = amd_bytealign (w2[0], w2[1], offset); + w5[3] = amd_bytealign (w1[3], w2[0], offset); + w5[2] = amd_bytealign (w1[2], w1[3], offset); + w5[1] = amd_bytealign (w1[1], w1[2], offset); + w5[0] = amd_bytealign (w1[0], w1[1], offset); + w4[3] = amd_bytealign (w0[3], w1[0], offset); + w4[2] = amd_bytealign (w0[2], w0[3], offset); + w4[1] = amd_bytealign (w0[1], w0[2], offset); + w4[0] = amd_bytealign (w0[0], w0[1], offset); + w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -7200,34 +7144,620 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = 
w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 16: + w7[3] = amd_bytealign (w3[2], w3[3], offset); + w7[2] = amd_bytealign (w3[1], w3[2], offset); + w7[1] = amd_bytealign (w3[0], w3[1], offset); + w7[0] = amd_bytealign (w2[3], w3[0], offset); + w6[3] = amd_bytealign (w2[2], w2[3], offset); + w6[2] = amd_bytealign (w2[1], w2[2], offset); + w6[1] = amd_bytealign (w2[0], w2[1], offset); + w6[0] = amd_bytealign (w1[3], w2[0], offset); + w5[3] = amd_bytealign (w1[2], w1[3], offset); + w5[2] = amd_bytealign (w1[1], w1[2], offset); + w5[1] = amd_bytealign (w1[0], w1[1], offset); + w5[0] = amd_bytealign (w0[3], w1[0], offset); + w4[3] = amd_bytealign (w0[2], w0[3], offset); + w4[2] = amd_bytealign (w0[1], w0[2], offset); + w4[1] = amd_bytealign (w0[0], w0[1], offset); + w4[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + w7[3] = amd_bytealign (w3[1], w3[2], offset); + w7[2] = amd_bytealign (w3[0], w3[1], offset); + w7[1] = amd_bytealign (w2[3], w3[0], offset); + w7[0] = amd_bytealign (w2[2], w2[3], offset); + w6[3] = amd_bytealign (w2[1], w2[2], offset); + w6[2] = amd_bytealign (w2[0], w2[1], offset); + w6[1] = amd_bytealign (w1[3], w2[0], offset); + w6[0] = amd_bytealign (w1[2], w1[3], offset); + w5[3] = amd_bytealign (w1[1], w1[2], offset); + w5[2] = amd_bytealign (w1[0], w1[1], offset); + w5[1] = amd_bytealign (w0[3], w1[0], offset); + w5[0] = amd_bytealign (w0[2], w0[3], offset); + w4[3] = amd_bytealign (w0[1], w0[2], offset); + w4[2] = amd_bytealign (w0[0], w0[1], offset); + w4[1] = amd_bytealign ( 0, w0[0], offset); + w4[0] = 0; + 
w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + w7[3] = amd_bytealign (w3[0], w3[1], offset); + w7[2] = amd_bytealign (w2[3], w3[0], offset); + w7[1] = amd_bytealign (w2[2], w2[3], offset); + w7[0] = amd_bytealign (w2[1], w2[2], offset); + w6[3] = amd_bytealign (w2[0], w2[1], offset); + w6[2] = amd_bytealign (w1[3], w2[0], offset); + w6[1] = amd_bytealign (w1[2], w1[3], offset); + w6[0] = amd_bytealign (w1[1], w1[2], offset); + w5[3] = amd_bytealign (w1[0], w1[1], offset); + w5[2] = amd_bytealign (w0[3], w1[0], offset); + w5[1] = amd_bytealign (w0[2], w0[3], offset); + w5[0] = amd_bytealign (w0[1], w0[2], offset); + w4[3] = amd_bytealign (w0[0], w0[1], offset); + w4[2] = amd_bytealign ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + w7[3] = amd_bytealign (w2[3], w3[0], offset); + w7[2] = amd_bytealign (w2[2], w2[3], offset); + w7[1] = amd_bytealign (w2[1], w2[2], offset); + w7[0] = amd_bytealign (w2[0], w2[1], offset); + w6[3] = amd_bytealign (w1[3], w2[0], offset); + w6[2] = amd_bytealign (w1[2], w1[3], offset); + w6[1] = amd_bytealign (w1[1], w1[2], offset); + w6[0] = amd_bytealign (w1[0], w1[1], offset); + w5[3] = amd_bytealign (w0[3], w1[0], offset); + w5[2] = amd_bytealign (w0[2], w0[3], offset); + w5[1] = amd_bytealign (w0[1], w0[2], offset); + w5[0] = amd_bytealign (w0[0], w0[1], offset); + w4[3] = amd_bytealign ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; 
+ w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + w7[3] = amd_bytealign (w2[2], w2[3], offset); + w7[2] = amd_bytealign (w2[1], w2[2], offset); + w7[1] = amd_bytealign (w2[0], w2[1], offset); + w7[0] = amd_bytealign (w1[3], w2[0], offset); + w6[3] = amd_bytealign (w1[2], w1[3], offset); + w6[2] = amd_bytealign (w1[1], w1[2], offset); + w6[1] = amd_bytealign (w1[0], w1[1], offset); + w6[0] = amd_bytealign (w0[3], w1[0], offset); + w5[3] = amd_bytealign (w0[2], w0[3], offset); + w5[2] = amd_bytealign (w0[1], w0[2], offset); + w5[1] = amd_bytealign (w0[0], w0[1], offset); + w5[0] = amd_bytealign ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + w7[3] = amd_bytealign (w2[1], w2[2], offset); + w7[2] = amd_bytealign (w2[0], w2[1], offset); + w7[1] = amd_bytealign (w1[3], w2[0], offset); + w7[0] = amd_bytealign (w1[2], w1[3], offset); + w6[3] = amd_bytealign (w1[1], w1[2], offset); + w6[2] = amd_bytealign (w1[0], w1[1], offset); + w6[1] = amd_bytealign (w0[3], w1[0], offset); + w6[0] = amd_bytealign (w0[2], w0[3], offset); + w5[3] = amd_bytealign (w0[1], w0[2], offset); + w5[2] = amd_bytealign (w0[0], w0[1], offset); + w5[1] = amd_bytealign ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + w7[3] = amd_bytealign (w2[0], w2[1], offset); + w7[2] = amd_bytealign (w1[3], w2[0], offset); + w7[1] = amd_bytealign (w1[2], w1[3], offset); + w7[0] = amd_bytealign (w1[1], w1[2], offset); + w6[3] = amd_bytealign (w1[0], w1[1], offset); + w6[2] = amd_bytealign 
(w0[3], w1[0], offset); + w6[1] = amd_bytealign (w0[2], w0[3], offset); + w6[0] = amd_bytealign (w0[1], w0[2], offset); + w5[3] = amd_bytealign (w0[0], w0[1], offset); + w5[2] = amd_bytealign ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + w7[3] = amd_bytealign (w1[3], w2[0], offset); + w7[2] = amd_bytealign (w1[2], w1[3], offset); + w7[1] = amd_bytealign (w1[1], w1[2], offset); + w7[0] = amd_bytealign (w1[0], w1[1], offset); + w6[3] = amd_bytealign (w0[3], w1[0], offset); + w6[2] = amd_bytealign (w0[2], w0[3], offset); + w6[1] = amd_bytealign (w0[1], w0[2], offset); + w6[0] = amd_bytealign (w0[0], w0[1], offset); + w5[3] = amd_bytealign ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + w7[3] = amd_bytealign (w1[2], w1[3], offset); + w7[2] = amd_bytealign (w1[1], w1[2], offset); + w7[1] = amd_bytealign (w1[0], w1[1], offset); + w7[0] = amd_bytealign (w0[3], w1[0], offset); + w6[3] = amd_bytealign (w0[2], w0[3], offset); + w6[2] = amd_bytealign (w0[1], w0[2], offset); + w6[1] = amd_bytealign (w0[0], w0[1], offset); + w6[0] = amd_bytealign ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + w7[3] 
= amd_bytealign (w1[1], w1[2], offset); + w7[2] = amd_bytealign (w1[0], w1[1], offset); + w7[1] = amd_bytealign (w0[3], w1[0], offset); + w7[0] = amd_bytealign (w0[2], w0[3], offset); + w6[3] = amd_bytealign (w0[1], w0[2], offset); + w6[2] = amd_bytealign (w0[0], w0[1], offset); + w6[1] = amd_bytealign ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + w7[3] = amd_bytealign (w1[0], w1[1], offset); + w7[2] = amd_bytealign (w0[3], w1[0], offset); + w7[1] = amd_bytealign (w0[2], w0[3], offset); + w7[0] = amd_bytealign (w0[1], w0[2], offset); + w6[3] = amd_bytealign (w0[0], w0[1], offset); + w6[2] = amd_bytealign ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = amd_bytealign (w0[3], w1[0], offset); + w7[2] = amd_bytealign (w0[2], w0[3], offset); + w7[1] = amd_bytealign (w0[1], w0[2], offset); + w7[0] = amd_bytealign (w0[0], w0[1], offset); + w6[3] = amd_bytealign ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + w7[3] = amd_bytealign (w0[2], w0[3], offset); + w7[2] = 
amd_bytealign (w0[1], w0[2], offset); + w7[1] = amd_bytealign (w0[0], w0[1], offset); + w7[0] = amd_bytealign ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + w7[3] = amd_bytealign (w0[1], w0[2], offset); + w7[2] = amd_bytealign (w0[0], w0[1], offset); + w7[1] = amd_bytealign ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + w7[3] = amd_bytealign (w0[0], w0[1], offset); + w7[2] = amd_bytealign ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + w7[3] = amd_bytealign ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 
0; + w0[0] = 0; break; } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -7795,7 +8325,7 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], #endif } -inline void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -10114,7 +10644,7 @@ inline void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], #endif } -inline void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) +void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x 
w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -13489,7 +14019,17457 @@ inline void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w #endif } -inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) +void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset) +{ + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); + + switch (offset / 4) + { + case 0: + w[63] = amd_bytealign (w[62], w[63], offset); + w[62] = amd_bytealign (w[61], w[62], offset); + w[61] = amd_bytealign (w[60], w[61], offset); + w[60] = amd_bytealign (w[59], w[60], offset); + w[59] = amd_bytealign (w[58], w[59], offset); + w[58] = amd_bytealign (w[57], w[58], offset); + w[57] = amd_bytealign (w[56], w[57], offset); + w[56] = amd_bytealign (w[55], w[56], offset); + w[55] = amd_bytealign (w[54], w[55], offset); + w[54] = amd_bytealign (w[53], w[54], offset); + w[53] = amd_bytealign (w[52], w[53], offset); + w[52] = amd_bytealign (w[51], w[52], offset); + w[51] = amd_bytealign (w[50], w[51], offset); + w[50] = amd_bytealign (w[49], w[50], offset); + w[49] = amd_bytealign (w[48], w[49], offset); + w[48] = amd_bytealign (w[47], w[48], offset); + w[47] = amd_bytealign (w[46], w[47], offset); + w[46] = amd_bytealign (w[45], w[46], offset); + w[45] = amd_bytealign (w[44], w[45], offset); + w[44] = amd_bytealign (w[43], w[44], offset); + w[43] = amd_bytealign (w[42], w[43], offset); + w[42] = amd_bytealign (w[41], w[42], offset); + w[41] = amd_bytealign (w[40], w[41], offset); + w[40] = amd_bytealign (w[39], w[40], offset); + w[39] = amd_bytealign (w[38], w[39], offset); + w[38] = amd_bytealign (w[37], w[38], offset); + w[37] = amd_bytealign (w[36], w[37], 
offset); + w[36] = amd_bytealign (w[35], w[36], offset); + w[35] = amd_bytealign (w[34], w[35], offset); + w[34] = amd_bytealign (w[33], w[34], offset); + w[33] = amd_bytealign (w[32], w[33], offset); + w[32] = amd_bytealign (w[31], w[32], offset); + w[31] = amd_bytealign (w[30], w[31], offset); + w[30] = amd_bytealign (w[29], w[30], offset); + w[29] = amd_bytealign (w[28], w[29], offset); + w[28] = amd_bytealign (w[27], w[28], offset); + w[27] = amd_bytealign (w[26], w[27], offset); + w[26] = amd_bytealign (w[25], w[26], offset); + w[25] = amd_bytealign (w[24], w[25], offset); + w[24] = amd_bytealign (w[23], w[24], offset); + w[23] = amd_bytealign (w[22], w[23], offset); + w[22] = amd_bytealign (w[21], w[22], offset); + w[21] = amd_bytealign (w[20], w[21], offset); + w[20] = amd_bytealign (w[19], w[20], offset); + w[19] = amd_bytealign (w[18], w[19], offset); + w[18] = amd_bytealign (w[17], w[18], offset); + w[17] = amd_bytealign (w[16], w[17], offset); + w[16] = amd_bytealign (w[15], w[16], offset); + w[15] = amd_bytealign (w[14], w[15], offset); + w[14] = amd_bytealign (w[13], w[14], offset); + w[13] = amd_bytealign (w[12], w[13], offset); + w[12] = amd_bytealign (w[11], w[12], offset); + w[11] = amd_bytealign (w[10], w[11], offset); + w[10] = amd_bytealign (w[ 9], w[10], offset); + w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = amd_bytealign (w[61], w[62], offset); + w[62] = amd_bytealign (w[60], w[61], offset); + w[61] = amd_bytealign (w[59], w[60], offset); + w[60] = amd_bytealign (w[58], w[59], offset); + 
w[59] = amd_bytealign (w[57], w[58], offset); + w[58] = amd_bytealign (w[56], w[57], offset); + w[57] = amd_bytealign (w[55], w[56], offset); + w[56] = amd_bytealign (w[54], w[55], offset); + w[55] = amd_bytealign (w[53], w[54], offset); + w[54] = amd_bytealign (w[52], w[53], offset); + w[53] = amd_bytealign (w[51], w[52], offset); + w[52] = amd_bytealign (w[50], w[51], offset); + w[51] = amd_bytealign (w[49], w[50], offset); + w[50] = amd_bytealign (w[48], w[49], offset); + w[49] = amd_bytealign (w[47], w[48], offset); + w[48] = amd_bytealign (w[46], w[47], offset); + w[47] = amd_bytealign (w[45], w[46], offset); + w[46] = amd_bytealign (w[44], w[45], offset); + w[45] = amd_bytealign (w[43], w[44], offset); + w[44] = amd_bytealign (w[42], w[43], offset); + w[43] = amd_bytealign (w[41], w[42], offset); + w[42] = amd_bytealign (w[40], w[41], offset); + w[41] = amd_bytealign (w[39], w[40], offset); + w[40] = amd_bytealign (w[38], w[39], offset); + w[39] = amd_bytealign (w[37], w[38], offset); + w[38] = amd_bytealign (w[36], w[37], offset); + w[37] = amd_bytealign (w[35], w[36], offset); + w[36] = amd_bytealign (w[34], w[35], offset); + w[35] = amd_bytealign (w[33], w[34], offset); + w[34] = amd_bytealign (w[32], w[33], offset); + w[33] = amd_bytealign (w[31], w[32], offset); + w[32] = amd_bytealign (w[30], w[31], offset); + w[31] = amd_bytealign (w[29], w[30], offset); + w[30] = amd_bytealign (w[28], w[29], offset); + w[29] = amd_bytealign (w[27], w[28], offset); + w[28] = amd_bytealign (w[26], w[27], offset); + w[27] = amd_bytealign (w[25], w[26], offset); + w[26] = amd_bytealign (w[24], w[25], offset); + w[25] = amd_bytealign (w[23], w[24], offset); + w[24] = amd_bytealign (w[22], w[23], offset); + w[23] = amd_bytealign (w[21], w[22], offset); + w[22] = amd_bytealign (w[20], w[21], offset); + w[21] = amd_bytealign (w[19], w[20], offset); + w[20] = amd_bytealign (w[18], w[19], offset); + w[19] = amd_bytealign (w[17], w[18], offset); + w[18] = amd_bytealign (w[16], 
w[17], offset); + w[17] = amd_bytealign (w[15], w[16], offset); + w[16] = amd_bytealign (w[14], w[15], offset); + w[15] = amd_bytealign (w[13], w[14], offset); + w[14] = amd_bytealign (w[12], w[13], offset); + w[13] = amd_bytealign (w[11], w[12], offset); + w[12] = amd_bytealign (w[10], w[11], offset); + w[11] = amd_bytealign (w[ 9], w[10], offset); + w[10] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[ 0] = 0; + + break; + + case 2: + w[63] = amd_bytealign (w[60], w[61], offset); + w[62] = amd_bytealign (w[59], w[60], offset); + w[61] = amd_bytealign (w[58], w[59], offset); + w[60] = amd_bytealign (w[57], w[58], offset); + w[59] = amd_bytealign (w[56], w[57], offset); + w[58] = amd_bytealign (w[55], w[56], offset); + w[57] = amd_bytealign (w[54], w[55], offset); + w[56] = amd_bytealign (w[53], w[54], offset); + w[55] = amd_bytealign (w[52], w[53], offset); + w[54] = amd_bytealign (w[51], w[52], offset); + w[53] = amd_bytealign (w[50], w[51], offset); + w[52] = amd_bytealign (w[49], w[50], offset); + w[51] = amd_bytealign (w[48], w[49], offset); + w[50] = amd_bytealign (w[47], w[48], offset); + w[49] = amd_bytealign (w[46], w[47], offset); + w[48] = amd_bytealign (w[45], w[46], offset); + w[47] = amd_bytealign (w[44], w[45], offset); + w[46] = amd_bytealign (w[43], w[44], offset); + w[45] = amd_bytealign (w[42], w[43], offset); + w[44] = amd_bytealign (w[41], w[42], offset); + w[43] = amd_bytealign (w[40], w[41], offset); + w[42] = amd_bytealign (w[39], w[40], offset); + w[41] = amd_bytealign (w[38], w[39], offset); + w[40] = amd_bytealign 
(w[37], w[38], offset); + w[39] = amd_bytealign (w[36], w[37], offset); + w[38] = amd_bytealign (w[35], w[36], offset); + w[37] = amd_bytealign (w[34], w[35], offset); + w[36] = amd_bytealign (w[33], w[34], offset); + w[35] = amd_bytealign (w[32], w[33], offset); + w[34] = amd_bytealign (w[31], w[32], offset); + w[33] = amd_bytealign (w[30], w[31], offset); + w[32] = amd_bytealign (w[29], w[30], offset); + w[31] = amd_bytealign (w[28], w[29], offset); + w[30] = amd_bytealign (w[27], w[28], offset); + w[29] = amd_bytealign (w[26], w[27], offset); + w[28] = amd_bytealign (w[25], w[26], offset); + w[27] = amd_bytealign (w[24], w[25], offset); + w[26] = amd_bytealign (w[23], w[24], offset); + w[25] = amd_bytealign (w[22], w[23], offset); + w[24] = amd_bytealign (w[21], w[22], offset); + w[23] = amd_bytealign (w[20], w[21], offset); + w[22] = amd_bytealign (w[19], w[20], offset); + w[21] = amd_bytealign (w[18], w[19], offset); + w[20] = amd_bytealign (w[17], w[18], offset); + w[19] = amd_bytealign (w[16], w[17], offset); + w[18] = amd_bytealign (w[15], w[16], offset); + w[17] = amd_bytealign (w[14], w[15], offset); + w[16] = amd_bytealign (w[13], w[14], offset); + w[15] = amd_bytealign (w[12], w[13], offset); + w[14] = amd_bytealign (w[11], w[12], offset); + w[13] = amd_bytealign (w[10], w[11], offset); + w[12] = amd_bytealign (w[ 9], w[10], offset); + w[11] = amd_bytealign (w[ 8], w[ 9], offset); + w[10] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = amd_bytealign (w[59], w[60], offset); + w[62] = amd_bytealign (w[58], w[59], offset); + w[61] = 
amd_bytealign (w[57], w[58], offset); + w[60] = amd_bytealign (w[56], w[57], offset); + w[59] = amd_bytealign (w[55], w[56], offset); + w[58] = amd_bytealign (w[54], w[55], offset); + w[57] = amd_bytealign (w[53], w[54], offset); + w[56] = amd_bytealign (w[52], w[53], offset); + w[55] = amd_bytealign (w[51], w[52], offset); + w[54] = amd_bytealign (w[50], w[51], offset); + w[53] = amd_bytealign (w[49], w[50], offset); + w[52] = amd_bytealign (w[48], w[49], offset); + w[51] = amd_bytealign (w[47], w[48], offset); + w[50] = amd_bytealign (w[46], w[47], offset); + w[49] = amd_bytealign (w[45], w[46], offset); + w[48] = amd_bytealign (w[44], w[45], offset); + w[47] = amd_bytealign (w[43], w[44], offset); + w[46] = amd_bytealign (w[42], w[43], offset); + w[45] = amd_bytealign (w[41], w[42], offset); + w[44] = amd_bytealign (w[40], w[41], offset); + w[43] = amd_bytealign (w[39], w[40], offset); + w[42] = amd_bytealign (w[38], w[39], offset); + w[41] = amd_bytealign (w[37], w[38], offset); + w[40] = amd_bytealign (w[36], w[37], offset); + w[39] = amd_bytealign (w[35], w[36], offset); + w[38] = amd_bytealign (w[34], w[35], offset); + w[37] = amd_bytealign (w[33], w[34], offset); + w[36] = amd_bytealign (w[32], w[33], offset); + w[35] = amd_bytealign (w[31], w[32], offset); + w[34] = amd_bytealign (w[30], w[31], offset); + w[33] = amd_bytealign (w[29], w[30], offset); + w[32] = amd_bytealign (w[28], w[29], offset); + w[31] = amd_bytealign (w[27], w[28], offset); + w[30] = amd_bytealign (w[26], w[27], offset); + w[29] = amd_bytealign (w[25], w[26], offset); + w[28] = amd_bytealign (w[24], w[25], offset); + w[27] = amd_bytealign (w[23], w[24], offset); + w[26] = amd_bytealign (w[22], w[23], offset); + w[25] = amd_bytealign (w[21], w[22], offset); + w[24] = amd_bytealign (w[20], w[21], offset); + w[23] = amd_bytealign (w[19], w[20], offset); + w[22] = amd_bytealign (w[18], w[19], offset); + w[21] = amd_bytealign (w[17], w[18], offset); + w[20] = amd_bytealign (w[16], w[17], 
offset); + w[19] = amd_bytealign (w[15], w[16], offset); + w[18] = amd_bytealign (w[14], w[15], offset); + w[17] = amd_bytealign (w[13], w[14], offset); + w[16] = amd_bytealign (w[12], w[13], offset); + w[15] = amd_bytealign (w[11], w[12], offset); + w[14] = amd_bytealign (w[10], w[11], offset); + w[13] = amd_bytealign (w[ 9], w[10], offset); + w[12] = amd_bytealign (w[ 8], w[ 9], offset); + w[11] = amd_bytealign (w[ 7], w[ 8], offset); + w[10] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = amd_bytealign (w[58], w[59], offset); + w[62] = amd_bytealign (w[57], w[58], offset); + w[61] = amd_bytealign (w[56], w[57], offset); + w[60] = amd_bytealign (w[55], w[56], offset); + w[59] = amd_bytealign (w[54], w[55], offset); + w[58] = amd_bytealign (w[53], w[54], offset); + w[57] = amd_bytealign (w[52], w[53], offset); + w[56] = amd_bytealign (w[51], w[52], offset); + w[55] = amd_bytealign (w[50], w[51], offset); + w[54] = amd_bytealign (w[49], w[50], offset); + w[53] = amd_bytealign (w[48], w[49], offset); + w[52] = amd_bytealign (w[47], w[48], offset); + w[51] = amd_bytealign (w[46], w[47], offset); + w[50] = amd_bytealign (w[45], w[46], offset); + w[49] = amd_bytealign (w[44], w[45], offset); + w[48] = amd_bytealign (w[43], w[44], offset); + w[47] = amd_bytealign (w[42], w[43], offset); + w[46] = amd_bytealign (w[41], w[42], offset); + w[45] = amd_bytealign (w[40], w[41], offset); + w[44] = amd_bytealign (w[39], w[40], offset); + w[43] = amd_bytealign (w[38], w[39], offset); + w[42] = amd_bytealign (w[37], w[38], offset); + w[41] = amd_bytealign (w[36], w[37], offset); + w[40] = 
amd_bytealign (w[35], w[36], offset); + w[39] = amd_bytealign (w[34], w[35], offset); + w[38] = amd_bytealign (w[33], w[34], offset); + w[37] = amd_bytealign (w[32], w[33], offset); + w[36] = amd_bytealign (w[31], w[32], offset); + w[35] = amd_bytealign (w[30], w[31], offset); + w[34] = amd_bytealign (w[29], w[30], offset); + w[33] = amd_bytealign (w[28], w[29], offset); + w[32] = amd_bytealign (w[27], w[28], offset); + w[31] = amd_bytealign (w[26], w[27], offset); + w[30] = amd_bytealign (w[25], w[26], offset); + w[29] = amd_bytealign (w[24], w[25], offset); + w[28] = amd_bytealign (w[23], w[24], offset); + w[27] = amd_bytealign (w[22], w[23], offset); + w[26] = amd_bytealign (w[21], w[22], offset); + w[25] = amd_bytealign (w[20], w[21], offset); + w[24] = amd_bytealign (w[19], w[20], offset); + w[23] = amd_bytealign (w[18], w[19], offset); + w[22] = amd_bytealign (w[17], w[18], offset); + w[21] = amd_bytealign (w[16], w[17], offset); + w[20] = amd_bytealign (w[15], w[16], offset); + w[19] = amd_bytealign (w[14], w[15], offset); + w[18] = amd_bytealign (w[13], w[14], offset); + w[17] = amd_bytealign (w[12], w[13], offset); + w[16] = amd_bytealign (w[11], w[12], offset); + w[15] = amd_bytealign (w[10], w[11], offset); + w[14] = amd_bytealign (w[ 9], w[10], offset); + w[13] = amd_bytealign (w[ 8], w[ 9], offset); + w[12] = amd_bytealign (w[ 7], w[ 8], offset); + w[11] = amd_bytealign (w[ 6], w[ 7], offset); + w[10] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = amd_bytealign (w[57], w[58], offset); + w[62] = amd_bytealign (w[56], w[57], offset); + w[61] = amd_bytealign (w[55], w[56], offset); + w[60] = 
amd_bytealign (w[54], w[55], offset); + w[59] = amd_bytealign (w[53], w[54], offset); + w[58] = amd_bytealign (w[52], w[53], offset); + w[57] = amd_bytealign (w[51], w[52], offset); + w[56] = amd_bytealign (w[50], w[51], offset); + w[55] = amd_bytealign (w[49], w[50], offset); + w[54] = amd_bytealign (w[48], w[49], offset); + w[53] = amd_bytealign (w[47], w[48], offset); + w[52] = amd_bytealign (w[46], w[47], offset); + w[51] = amd_bytealign (w[45], w[46], offset); + w[50] = amd_bytealign (w[44], w[45], offset); + w[49] = amd_bytealign (w[43], w[44], offset); + w[48] = amd_bytealign (w[42], w[43], offset); + w[47] = amd_bytealign (w[41], w[42], offset); + w[46] = amd_bytealign (w[40], w[41], offset); + w[45] = amd_bytealign (w[39], w[40], offset); + w[44] = amd_bytealign (w[38], w[39], offset); + w[43] = amd_bytealign (w[37], w[38], offset); + w[42] = amd_bytealign (w[36], w[37], offset); + w[41] = amd_bytealign (w[35], w[36], offset); + w[40] = amd_bytealign (w[34], w[35], offset); + w[39] = amd_bytealign (w[33], w[34], offset); + w[38] = amd_bytealign (w[32], w[33], offset); + w[37] = amd_bytealign (w[31], w[32], offset); + w[36] = amd_bytealign (w[30], w[31], offset); + w[35] = amd_bytealign (w[29], w[30], offset); + w[34] = amd_bytealign (w[28], w[29], offset); + w[33] = amd_bytealign (w[27], w[28], offset); + w[32] = amd_bytealign (w[26], w[27], offset); + w[31] = amd_bytealign (w[25], w[26], offset); + w[30] = amd_bytealign (w[24], w[25], offset); + w[29] = amd_bytealign (w[23], w[24], offset); + w[28] = amd_bytealign (w[22], w[23], offset); + w[27] = amd_bytealign (w[21], w[22], offset); + w[26] = amd_bytealign (w[20], w[21], offset); + w[25] = amd_bytealign (w[19], w[20], offset); + w[24] = amd_bytealign (w[18], w[19], offset); + w[23] = amd_bytealign (w[17], w[18], offset); + w[22] = amd_bytealign (w[16], w[17], offset); + w[21] = amd_bytealign (w[15], w[16], offset); + w[20] = amd_bytealign (w[14], w[15], offset); + w[19] = amd_bytealign (w[13], w[14], 
offset); + w[18] = amd_bytealign (w[12], w[13], offset); + w[17] = amd_bytealign (w[11], w[12], offset); + w[16] = amd_bytealign (w[10], w[11], offset); + w[15] = amd_bytealign (w[ 9], w[10], offset); + w[14] = amd_bytealign (w[ 8], w[ 9], offset); + w[13] = amd_bytealign (w[ 7], w[ 8], offset); + w[12] = amd_bytealign (w[ 6], w[ 7], offset); + w[11] = amd_bytealign (w[ 5], w[ 6], offset); + w[10] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = amd_bytealign (w[56], w[57], offset); + w[62] = amd_bytealign (w[55], w[56], offset); + w[61] = amd_bytealign (w[54], w[55], offset); + w[60] = amd_bytealign (w[53], w[54], offset); + w[59] = amd_bytealign (w[52], w[53], offset); + w[58] = amd_bytealign (w[51], w[52], offset); + w[57] = amd_bytealign (w[50], w[51], offset); + w[56] = amd_bytealign (w[49], w[50], offset); + w[55] = amd_bytealign (w[48], w[49], offset); + w[54] = amd_bytealign (w[47], w[48], offset); + w[53] = amd_bytealign (w[46], w[47], offset); + w[52] = amd_bytealign (w[45], w[46], offset); + w[51] = amd_bytealign (w[44], w[45], offset); + w[50] = amd_bytealign (w[43], w[44], offset); + w[49] = amd_bytealign (w[42], w[43], offset); + w[48] = amd_bytealign (w[41], w[42], offset); + w[47] = amd_bytealign (w[40], w[41], offset); + w[46] = amd_bytealign (w[39], w[40], offset); + w[45] = amd_bytealign (w[38], w[39], offset); + w[44] = amd_bytealign (w[37], w[38], offset); + w[43] = amd_bytealign (w[36], w[37], offset); + w[42] = amd_bytealign (w[35], w[36], offset); + w[41] = amd_bytealign (w[34], w[35], offset); + w[40] = amd_bytealign (w[33], w[34], offset); + w[39] = amd_bytealign (w[32], w[33], offset); + w[38] = amd_bytealign (w[31], 
w[32], offset); + w[37] = amd_bytealign (w[30], w[31], offset); + w[36] = amd_bytealign (w[29], w[30], offset); + w[35] = amd_bytealign (w[28], w[29], offset); + w[34] = amd_bytealign (w[27], w[28], offset); + w[33] = amd_bytealign (w[26], w[27], offset); + w[32] = amd_bytealign (w[25], w[26], offset); + w[31] = amd_bytealign (w[24], w[25], offset); + w[30] = amd_bytealign (w[23], w[24], offset); + w[29] = amd_bytealign (w[22], w[23], offset); + w[28] = amd_bytealign (w[21], w[22], offset); + w[27] = amd_bytealign (w[20], w[21], offset); + w[26] = amd_bytealign (w[19], w[20], offset); + w[25] = amd_bytealign (w[18], w[19], offset); + w[24] = amd_bytealign (w[17], w[18], offset); + w[23] = amd_bytealign (w[16], w[17], offset); + w[22] = amd_bytealign (w[15], w[16], offset); + w[21] = amd_bytealign (w[14], w[15], offset); + w[20] = amd_bytealign (w[13], w[14], offset); + w[19] = amd_bytealign (w[12], w[13], offset); + w[18] = amd_bytealign (w[11], w[12], offset); + w[17] = amd_bytealign (w[10], w[11], offset); + w[16] = amd_bytealign (w[ 9], w[10], offset); + w[15] = amd_bytealign (w[ 8], w[ 9], offset); + w[14] = amd_bytealign (w[ 7], w[ 8], offset); + w[13] = amd_bytealign (w[ 6], w[ 7], offset); + w[12] = amd_bytealign (w[ 5], w[ 6], offset); + w[11] = amd_bytealign (w[ 4], w[ 5], offset); + w[10] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = amd_bytealign (w[55], w[56], offset); + w[62] = amd_bytealign (w[54], w[55], offset); + w[61] = amd_bytealign (w[53], w[54], offset); + w[60] = amd_bytealign (w[52], w[53], offset); + w[59] = amd_bytealign (w[51], w[52], offset); + w[58] = amd_bytealign (w[50], w[51], offset); + w[57] = amd_bytealign (w[49], w[50], offset); + w[56] = 
amd_bytealign (w[48], w[49], offset); + w[55] = amd_bytealign (w[47], w[48], offset); + w[54] = amd_bytealign (w[46], w[47], offset); + w[53] = amd_bytealign (w[45], w[46], offset); + w[52] = amd_bytealign (w[44], w[45], offset); + w[51] = amd_bytealign (w[43], w[44], offset); + w[50] = amd_bytealign (w[42], w[43], offset); + w[49] = amd_bytealign (w[41], w[42], offset); + w[48] = amd_bytealign (w[40], w[41], offset); + w[47] = amd_bytealign (w[39], w[40], offset); + w[46] = amd_bytealign (w[38], w[39], offset); + w[45] = amd_bytealign (w[37], w[38], offset); + w[44] = amd_bytealign (w[36], w[37], offset); + w[43] = amd_bytealign (w[35], w[36], offset); + w[42] = amd_bytealign (w[34], w[35], offset); + w[41] = amd_bytealign (w[33], w[34], offset); + w[40] = amd_bytealign (w[32], w[33], offset); + w[39] = amd_bytealign (w[31], w[32], offset); + w[38] = amd_bytealign (w[30], w[31], offset); + w[37] = amd_bytealign (w[29], w[30], offset); + w[36] = amd_bytealign (w[28], w[29], offset); + w[35] = amd_bytealign (w[27], w[28], offset); + w[34] = amd_bytealign (w[26], w[27], offset); + w[33] = amd_bytealign (w[25], w[26], offset); + w[32] = amd_bytealign (w[24], w[25], offset); + w[31] = amd_bytealign (w[23], w[24], offset); + w[30] = amd_bytealign (w[22], w[23], offset); + w[29] = amd_bytealign (w[21], w[22], offset); + w[28] = amd_bytealign (w[20], w[21], offset); + w[27] = amd_bytealign (w[19], w[20], offset); + w[26] = amd_bytealign (w[18], w[19], offset); + w[25] = amd_bytealign (w[17], w[18], offset); + w[24] = amd_bytealign (w[16], w[17], offset); + w[23] = amd_bytealign (w[15], w[16], offset); + w[22] = amd_bytealign (w[14], w[15], offset); + w[21] = amd_bytealign (w[13], w[14], offset); + w[20] = amd_bytealign (w[12], w[13], offset); + w[19] = amd_bytealign (w[11], w[12], offset); + w[18] = amd_bytealign (w[10], w[11], offset); + w[17] = amd_bytealign (w[ 9], w[10], offset); + w[16] = amd_bytealign (w[ 8], w[ 9], offset); + w[15] = amd_bytealign (w[ 7], w[ 8], 
offset); + w[14] = amd_bytealign (w[ 6], w[ 7], offset); + w[13] = amd_bytealign (w[ 5], w[ 6], offset); + w[12] = amd_bytealign (w[ 4], w[ 5], offset); + w[11] = amd_bytealign (w[ 3], w[ 4], offset); + w[10] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = amd_bytealign (w[54], w[55], offset); + w[62] = amd_bytealign (w[53], w[54], offset); + w[61] = amd_bytealign (w[52], w[53], offset); + w[60] = amd_bytealign (w[51], w[52], offset); + w[59] = amd_bytealign (w[50], w[51], offset); + w[58] = amd_bytealign (w[49], w[50], offset); + w[57] = amd_bytealign (w[48], w[49], offset); + w[56] = amd_bytealign (w[47], w[48], offset); + w[55] = amd_bytealign (w[46], w[47], offset); + w[54] = amd_bytealign (w[45], w[46], offset); + w[53] = amd_bytealign (w[44], w[45], offset); + w[52] = amd_bytealign (w[43], w[44], offset); + w[51] = amd_bytealign (w[42], w[43], offset); + w[50] = amd_bytealign (w[41], w[42], offset); + w[49] = amd_bytealign (w[40], w[41], offset); + w[48] = amd_bytealign (w[39], w[40], offset); + w[47] = amd_bytealign (w[38], w[39], offset); + w[46] = amd_bytealign (w[37], w[38], offset); + w[45] = amd_bytealign (w[36], w[37], offset); + w[44] = amd_bytealign (w[35], w[36], offset); + w[43] = amd_bytealign (w[34], w[35], offset); + w[42] = amd_bytealign (w[33], w[34], offset); + w[41] = amd_bytealign (w[32], w[33], offset); + w[40] = amd_bytealign (w[31], w[32], offset); + w[39] = amd_bytealign (w[30], w[31], offset); + w[38] = amd_bytealign (w[29], w[30], offset); + w[37] = amd_bytealign (w[28], w[29], offset); + w[36] = amd_bytealign (w[27], w[28], offset); + w[35] = amd_bytealign (w[26], w[27], offset); + w[34] = amd_bytealign (w[25], w[26], offset); + w[33] = amd_bytealign (w[24], w[25], offset); + w[32] 
= amd_bytealign (w[23], w[24], offset); + w[31] = amd_bytealign (w[22], w[23], offset); + w[30] = amd_bytealign (w[21], w[22], offset); + w[29] = amd_bytealign (w[20], w[21], offset); + w[28] = amd_bytealign (w[19], w[20], offset); + w[27] = amd_bytealign (w[18], w[19], offset); + w[26] = amd_bytealign (w[17], w[18], offset); + w[25] = amd_bytealign (w[16], w[17], offset); + w[24] = amd_bytealign (w[15], w[16], offset); + w[23] = amd_bytealign (w[14], w[15], offset); + w[22] = amd_bytealign (w[13], w[14], offset); + w[21] = amd_bytealign (w[12], w[13], offset); + w[20] = amd_bytealign (w[11], w[12], offset); + w[19] = amd_bytealign (w[10], w[11], offset); + w[18] = amd_bytealign (w[ 9], w[10], offset); + w[17] = amd_bytealign (w[ 8], w[ 9], offset); + w[16] = amd_bytealign (w[ 7], w[ 8], offset); + w[15] = amd_bytealign (w[ 6], w[ 7], offset); + w[14] = amd_bytealign (w[ 5], w[ 6], offset); + w[13] = amd_bytealign (w[ 4], w[ 5], offset); + w[12] = amd_bytealign (w[ 3], w[ 4], offset); + w[11] = amd_bytealign (w[ 2], w[ 3], offset); + w[10] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = amd_bytealign (w[53], w[54], offset); + w[62] = amd_bytealign (w[52], w[53], offset); + w[61] = amd_bytealign (w[51], w[52], offset); + w[60] = amd_bytealign (w[50], w[51], offset); + w[59] = amd_bytealign (w[49], w[50], offset); + w[58] = amd_bytealign (w[48], w[49], offset); + w[57] = amd_bytealign (w[47], w[48], offset); + w[56] = amd_bytealign (w[46], w[47], offset); + w[55] = amd_bytealign (w[45], w[46], offset); + w[54] = amd_bytealign (w[44], w[45], offset); + w[53] = amd_bytealign (w[43], w[44], offset); + w[52] = amd_bytealign (w[42], w[43], offset); + w[51] = amd_bytealign (w[41], w[42], offset); + w[50] = amd_bytealign (w[40], w[41], offset); + w[49] = 
amd_bytealign (w[39], w[40], offset); + w[48] = amd_bytealign (w[38], w[39], offset); + w[47] = amd_bytealign (w[37], w[38], offset); + w[46] = amd_bytealign (w[36], w[37], offset); + w[45] = amd_bytealign (w[35], w[36], offset); + w[44] = amd_bytealign (w[34], w[35], offset); + w[43] = amd_bytealign (w[33], w[34], offset); + w[42] = amd_bytealign (w[32], w[33], offset); + w[41] = amd_bytealign (w[31], w[32], offset); + w[40] = amd_bytealign (w[30], w[31], offset); + w[39] = amd_bytealign (w[29], w[30], offset); + w[38] = amd_bytealign (w[28], w[29], offset); + w[37] = amd_bytealign (w[27], w[28], offset); + w[36] = amd_bytealign (w[26], w[27], offset); + w[35] = amd_bytealign (w[25], w[26], offset); + w[34] = amd_bytealign (w[24], w[25], offset); + w[33] = amd_bytealign (w[23], w[24], offset); + w[32] = amd_bytealign (w[22], w[23], offset); + w[31] = amd_bytealign (w[21], w[22], offset); + w[30] = amd_bytealign (w[20], w[21], offset); + w[29] = amd_bytealign (w[19], w[20], offset); + w[28] = amd_bytealign (w[18], w[19], offset); + w[27] = amd_bytealign (w[17], w[18], offset); + w[26] = amd_bytealign (w[16], w[17], offset); + w[25] = amd_bytealign (w[15], w[16], offset); + w[24] = amd_bytealign (w[14], w[15], offset); + w[23] = amd_bytealign (w[13], w[14], offset); + w[22] = amd_bytealign (w[12], w[13], offset); + w[21] = amd_bytealign (w[11], w[12], offset); + w[20] = amd_bytealign (w[10], w[11], offset); + w[19] = amd_bytealign (w[ 9], w[10], offset); + w[18] = amd_bytealign (w[ 8], w[ 9], offset); + w[17] = amd_bytealign (w[ 7], w[ 8], offset); + w[16] = amd_bytealign (w[ 6], w[ 7], offset); + w[15] = amd_bytealign (w[ 5], w[ 6], offset); + w[14] = amd_bytealign (w[ 4], w[ 5], offset); + w[13] = amd_bytealign (w[ 3], w[ 4], offset); + w[12] = amd_bytealign (w[ 2], w[ 3], offset); + w[11] = amd_bytealign (w[ 1], w[ 2], offset); + w[10] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 
5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = amd_bytealign (w[52], w[53], offset); + w[62] = amd_bytealign (w[51], w[52], offset); + w[61] = amd_bytealign (w[50], w[51], offset); + w[60] = amd_bytealign (w[49], w[50], offset); + w[59] = amd_bytealign (w[48], w[49], offset); + w[58] = amd_bytealign (w[47], w[48], offset); + w[57] = amd_bytealign (w[46], w[47], offset); + w[56] = amd_bytealign (w[45], w[46], offset); + w[55] = amd_bytealign (w[44], w[45], offset); + w[54] = amd_bytealign (w[43], w[44], offset); + w[53] = amd_bytealign (w[42], w[43], offset); + w[52] = amd_bytealign (w[41], w[42], offset); + w[51] = amd_bytealign (w[40], w[41], offset); + w[50] = amd_bytealign (w[39], w[40], offset); + w[49] = amd_bytealign (w[38], w[39], offset); + w[48] = amd_bytealign (w[37], w[38], offset); + w[47] = amd_bytealign (w[36], w[37], offset); + w[46] = amd_bytealign (w[35], w[36], offset); + w[45] = amd_bytealign (w[34], w[35], offset); + w[44] = amd_bytealign (w[33], w[34], offset); + w[43] = amd_bytealign (w[32], w[33], offset); + w[42] = amd_bytealign (w[31], w[32], offset); + w[41] = amd_bytealign (w[30], w[31], offset); + w[40] = amd_bytealign (w[29], w[30], offset); + w[39] = amd_bytealign (w[28], w[29], offset); + w[38] = amd_bytealign (w[27], w[28], offset); + w[37] = amd_bytealign (w[26], w[27], offset); + w[36] = amd_bytealign (w[25], w[26], offset); + w[35] = amd_bytealign (w[24], w[25], offset); + w[34] = amd_bytealign (w[23], w[24], offset); + w[33] = amd_bytealign (w[22], w[23], offset); + w[32] = amd_bytealign (w[21], w[22], offset); + w[31] = amd_bytealign (w[20], w[21], offset); + w[30] = amd_bytealign (w[19], w[20], offset); + w[29] = amd_bytealign (w[18], w[19], offset); + w[28] = amd_bytealign (w[17], w[18], offset); + w[27] = amd_bytealign (w[16], w[17], offset); + w[26] = amd_bytealign (w[15], w[16], offset); + w[25] = amd_bytealign (w[14], w[15], offset); + w[24] = amd_bytealign 
(w[13], w[14], offset); + w[23] = amd_bytealign (w[12], w[13], offset); + w[22] = amd_bytealign (w[11], w[12], offset); + w[21] = amd_bytealign (w[10], w[11], offset); + w[20] = amd_bytealign (w[ 9], w[10], offset); + w[19] = amd_bytealign (w[ 8], w[ 9], offset); + w[18] = amd_bytealign (w[ 7], w[ 8], offset); + w[17] = amd_bytealign (w[ 6], w[ 7], offset); + w[16] = amd_bytealign (w[ 5], w[ 6], offset); + w[15] = amd_bytealign (w[ 4], w[ 5], offset); + w[14] = amd_bytealign (w[ 3], w[ 4], offset); + w[13] = amd_bytealign (w[ 2], w[ 3], offset); + w[12] = amd_bytealign (w[ 1], w[ 2], offset); + w[11] = amd_bytealign (w[ 0], w[ 1], offset); + w[10] = amd_bytealign ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = amd_bytealign (w[51], w[52], offset); + w[62] = amd_bytealign (w[50], w[51], offset); + w[61] = amd_bytealign (w[49], w[50], offset); + w[60] = amd_bytealign (w[48], w[49], offset); + w[59] = amd_bytealign (w[47], w[48], offset); + w[58] = amd_bytealign (w[46], w[47], offset); + w[57] = amd_bytealign (w[45], w[46], offset); + w[56] = amd_bytealign (w[44], w[45], offset); + w[55] = amd_bytealign (w[43], w[44], offset); + w[54] = amd_bytealign (w[42], w[43], offset); + w[53] = amd_bytealign (w[41], w[42], offset); + w[52] = amd_bytealign (w[40], w[41], offset); + w[51] = amd_bytealign (w[39], w[40], offset); + w[50] = amd_bytealign (w[38], w[39], offset); + w[49] = amd_bytealign (w[37], w[38], offset); + w[48] = amd_bytealign (w[36], w[37], offset); + w[47] = amd_bytealign (w[35], w[36], offset); + w[46] = amd_bytealign (w[34], w[35], offset); + w[45] = amd_bytealign (w[33], w[34], offset); + w[44] = amd_bytealign (w[32], w[33], offset); + w[43] = amd_bytealign (w[31], w[32], offset); + w[42] = amd_bytealign (w[30], w[31], offset); + w[41] = amd_bytealign (w[29], w[30], offset); + w[40] = amd_bytealign (w[28], w[29], offset); 
+ w[39] = amd_bytealign (w[27], w[28], offset); + w[38] = amd_bytealign (w[26], w[27], offset); + w[37] = amd_bytealign (w[25], w[26], offset); + w[36] = amd_bytealign (w[24], w[25], offset); + w[35] = amd_bytealign (w[23], w[24], offset); + w[34] = amd_bytealign (w[22], w[23], offset); + w[33] = amd_bytealign (w[21], w[22], offset); + w[32] = amd_bytealign (w[20], w[21], offset); + w[31] = amd_bytealign (w[19], w[20], offset); + w[30] = amd_bytealign (w[18], w[19], offset); + w[29] = amd_bytealign (w[17], w[18], offset); + w[28] = amd_bytealign (w[16], w[17], offset); + w[27] = amd_bytealign (w[15], w[16], offset); + w[26] = amd_bytealign (w[14], w[15], offset); + w[25] = amd_bytealign (w[13], w[14], offset); + w[24] = amd_bytealign (w[12], w[13], offset); + w[23] = amd_bytealign (w[11], w[12], offset); + w[22] = amd_bytealign (w[10], w[11], offset); + w[21] = amd_bytealign (w[ 9], w[10], offset); + w[20] = amd_bytealign (w[ 8], w[ 9], offset); + w[19] = amd_bytealign (w[ 7], w[ 8], offset); + w[18] = amd_bytealign (w[ 6], w[ 7], offset); + w[17] = amd_bytealign (w[ 5], w[ 6], offset); + w[16] = amd_bytealign (w[ 4], w[ 5], offset); + w[15] = amd_bytealign (w[ 3], w[ 4], offset); + w[14] = amd_bytealign (w[ 2], w[ 3], offset); + w[13] = amd_bytealign (w[ 1], w[ 2], offset); + w[12] = amd_bytealign (w[ 0], w[ 1], offset); + w[11] = amd_bytealign ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = amd_bytealign (w[50], w[51], offset); + w[62] = amd_bytealign (w[49], w[50], offset); + w[61] = amd_bytealign (w[48], w[49], offset); + w[60] = amd_bytealign (w[47], w[48], offset); + w[59] = amd_bytealign (w[46], w[47], offset); + w[58] = amd_bytealign (w[45], w[46], offset); + w[57] = amd_bytealign (w[44], w[45], offset); + w[56] = amd_bytealign (w[43], w[44], offset); + w[55] = amd_bytealign (w[42], w[43], offset); + w[54] = 
amd_bytealign (w[41], w[42], offset); + w[53] = amd_bytealign (w[40], w[41], offset); + w[52] = amd_bytealign (w[39], w[40], offset); + w[51] = amd_bytealign (w[38], w[39], offset); + w[50] = amd_bytealign (w[37], w[38], offset); + w[49] = amd_bytealign (w[36], w[37], offset); + w[48] = amd_bytealign (w[35], w[36], offset); + w[47] = amd_bytealign (w[34], w[35], offset); + w[46] = amd_bytealign (w[33], w[34], offset); + w[45] = amd_bytealign (w[32], w[33], offset); + w[44] = amd_bytealign (w[31], w[32], offset); + w[43] = amd_bytealign (w[30], w[31], offset); + w[42] = amd_bytealign (w[29], w[30], offset); + w[41] = amd_bytealign (w[28], w[29], offset); + w[40] = amd_bytealign (w[27], w[28], offset); + w[39] = amd_bytealign (w[26], w[27], offset); + w[38] = amd_bytealign (w[25], w[26], offset); + w[37] = amd_bytealign (w[24], w[25], offset); + w[36] = amd_bytealign (w[23], w[24], offset); + w[35] = amd_bytealign (w[22], w[23], offset); + w[34] = amd_bytealign (w[21], w[22], offset); + w[33] = amd_bytealign (w[20], w[21], offset); + w[32] = amd_bytealign (w[19], w[20], offset); + w[31] = amd_bytealign (w[18], w[19], offset); + w[30] = amd_bytealign (w[17], w[18], offset); + w[29] = amd_bytealign (w[16], w[17], offset); + w[28] = amd_bytealign (w[15], w[16], offset); + w[27] = amd_bytealign (w[14], w[15], offset); + w[26] = amd_bytealign (w[13], w[14], offset); + w[25] = amd_bytealign (w[12], w[13], offset); + w[24] = amd_bytealign (w[11], w[12], offset); + w[23] = amd_bytealign (w[10], w[11], offset); + w[22] = amd_bytealign (w[ 9], w[10], offset); + w[21] = amd_bytealign (w[ 8], w[ 9], offset); + w[20] = amd_bytealign (w[ 7], w[ 8], offset); + w[19] = amd_bytealign (w[ 6], w[ 7], offset); + w[18] = amd_bytealign (w[ 5], w[ 6], offset); + w[17] = amd_bytealign (w[ 4], w[ 5], offset); + w[16] = amd_bytealign (w[ 3], w[ 4], offset); + w[15] = amd_bytealign (w[ 2], w[ 3], offset); + w[14] = amd_bytealign (w[ 1], w[ 2], offset); + w[13] = amd_bytealign (w[ 0], w[ 1], 
offset); + w[12] = amd_bytealign ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = amd_bytealign (w[49], w[50], offset); + w[62] = amd_bytealign (w[48], w[49], offset); + w[61] = amd_bytealign (w[47], w[48], offset); + w[60] = amd_bytealign (w[46], w[47], offset); + w[59] = amd_bytealign (w[45], w[46], offset); + w[58] = amd_bytealign (w[44], w[45], offset); + w[57] = amd_bytealign (w[43], w[44], offset); + w[56] = amd_bytealign (w[42], w[43], offset); + w[55] = amd_bytealign (w[41], w[42], offset); + w[54] = amd_bytealign (w[40], w[41], offset); + w[53] = amd_bytealign (w[39], w[40], offset); + w[52] = amd_bytealign (w[38], w[39], offset); + w[51] = amd_bytealign (w[37], w[38], offset); + w[50] = amd_bytealign (w[36], w[37], offset); + w[49] = amd_bytealign (w[35], w[36], offset); + w[48] = amd_bytealign (w[34], w[35], offset); + w[47] = amd_bytealign (w[33], w[34], offset); + w[46] = amd_bytealign (w[32], w[33], offset); + w[45] = amd_bytealign (w[31], w[32], offset); + w[44] = amd_bytealign (w[30], w[31], offset); + w[43] = amd_bytealign (w[29], w[30], offset); + w[42] = amd_bytealign (w[28], w[29], offset); + w[41] = amd_bytealign (w[27], w[28], offset); + w[40] = amd_bytealign (w[26], w[27], offset); + w[39] = amd_bytealign (w[25], w[26], offset); + w[38] = amd_bytealign (w[24], w[25], offset); + w[37] = amd_bytealign (w[23], w[24], offset); + w[36] = amd_bytealign (w[22], w[23], offset); + w[35] = amd_bytealign (w[21], w[22], offset); + w[34] = amd_bytealign (w[20], w[21], offset); + w[33] = amd_bytealign (w[19], w[20], offset); + w[32] = amd_bytealign (w[18], w[19], offset); + w[31] = amd_bytealign (w[17], w[18], offset); + w[30] = amd_bytealign (w[16], w[17], offset); + w[29] = amd_bytealign (w[15], w[16], offset); + w[28] = amd_bytealign (w[14], w[15], offset); + w[27] = amd_bytealign (w[13], 
w[14], offset); + w[26] = amd_bytealign (w[12], w[13], offset); + w[25] = amd_bytealign (w[11], w[12], offset); + w[24] = amd_bytealign (w[10], w[11], offset); + w[23] = amd_bytealign (w[ 9], w[10], offset); + w[22] = amd_bytealign (w[ 8], w[ 9], offset); + w[21] = amd_bytealign (w[ 7], w[ 8], offset); + w[20] = amd_bytealign (w[ 6], w[ 7], offset); + w[19] = amd_bytealign (w[ 5], w[ 6], offset); + w[18] = amd_bytealign (w[ 4], w[ 5], offset); + w[17] = amd_bytealign (w[ 3], w[ 4], offset); + w[16] = amd_bytealign (w[ 2], w[ 3], offset); + w[15] = amd_bytealign (w[ 1], w[ 2], offset); + w[14] = amd_bytealign (w[ 0], w[ 1], offset); + w[13] = amd_bytealign ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = amd_bytealign (w[48], w[49], offset); + w[62] = amd_bytealign (w[47], w[48], offset); + w[61] = amd_bytealign (w[46], w[47], offset); + w[60] = amd_bytealign (w[45], w[46], offset); + w[59] = amd_bytealign (w[44], w[45], offset); + w[58] = amd_bytealign (w[43], w[44], offset); + w[57] = amd_bytealign (w[42], w[43], offset); + w[56] = amd_bytealign (w[41], w[42], offset); + w[55] = amd_bytealign (w[40], w[41], offset); + w[54] = amd_bytealign (w[39], w[40], offset); + w[53] = amd_bytealign (w[38], w[39], offset); + w[52] = amd_bytealign (w[37], w[38], offset); + w[51] = amd_bytealign (w[36], w[37], offset); + w[50] = amd_bytealign (w[35], w[36], offset); + w[49] = amd_bytealign (w[34], w[35], offset); + w[48] = amd_bytealign (w[33], w[34], offset); + w[47] = amd_bytealign (w[32], w[33], offset); + w[46] = amd_bytealign (w[31], w[32], offset); + w[45] = amd_bytealign (w[30], w[31], offset); + w[44] = amd_bytealign (w[29], w[30], offset); + w[43] = amd_bytealign (w[28], w[29], offset); + w[42] = amd_bytealign (w[27], w[28], offset); + w[41] = amd_bytealign (w[26], w[27], offset); + w[40] = 
amd_bytealign (w[25], w[26], offset); + w[39] = amd_bytealign (w[24], w[25], offset); + w[38] = amd_bytealign (w[23], w[24], offset); + w[37] = amd_bytealign (w[22], w[23], offset); + w[36] = amd_bytealign (w[21], w[22], offset); + w[35] = amd_bytealign (w[20], w[21], offset); + w[34] = amd_bytealign (w[19], w[20], offset); + w[33] = amd_bytealign (w[18], w[19], offset); + w[32] = amd_bytealign (w[17], w[18], offset); + w[31] = amd_bytealign (w[16], w[17], offset); + w[30] = amd_bytealign (w[15], w[16], offset); + w[29] = amd_bytealign (w[14], w[15], offset); + w[28] = amd_bytealign (w[13], w[14], offset); + w[27] = amd_bytealign (w[12], w[13], offset); + w[26] = amd_bytealign (w[11], w[12], offset); + w[25] = amd_bytealign (w[10], w[11], offset); + w[24] = amd_bytealign (w[ 9], w[10], offset); + w[23] = amd_bytealign (w[ 8], w[ 9], offset); + w[22] = amd_bytealign (w[ 7], w[ 8], offset); + w[21] = amd_bytealign (w[ 6], w[ 7], offset); + w[20] = amd_bytealign (w[ 5], w[ 6], offset); + w[19] = amd_bytealign (w[ 4], w[ 5], offset); + w[18] = amd_bytealign (w[ 3], w[ 4], offset); + w[17] = amd_bytealign (w[ 2], w[ 3], offset); + w[16] = amd_bytealign (w[ 1], w[ 2], offset); + w[15] = amd_bytealign (w[ 0], w[ 1], offset); + w[14] = amd_bytealign ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = amd_bytealign (w[47], w[48], offset); + w[62] = amd_bytealign (w[46], w[47], offset); + w[61] = amd_bytealign (w[45], w[46], offset); + w[60] = amd_bytealign (w[44], w[45], offset); + w[59] = amd_bytealign (w[43], w[44], offset); + w[58] = amd_bytealign (w[42], w[43], offset); + w[57] = amd_bytealign (w[41], w[42], offset); + w[56] = amd_bytealign (w[40], w[41], offset); + w[55] = amd_bytealign (w[39], w[40], offset); + w[54] = amd_bytealign (w[38], w[39], offset); + w[53] = amd_bytealign 
(w[37], w[38], offset); + w[52] = amd_bytealign (w[36], w[37], offset); + w[51] = amd_bytealign (w[35], w[36], offset); + w[50] = amd_bytealign (w[34], w[35], offset); + w[49] = amd_bytealign (w[33], w[34], offset); + w[48] = amd_bytealign (w[32], w[33], offset); + w[47] = amd_bytealign (w[31], w[32], offset); + w[46] = amd_bytealign (w[30], w[31], offset); + w[45] = amd_bytealign (w[29], w[30], offset); + w[44] = amd_bytealign (w[28], w[29], offset); + w[43] = amd_bytealign (w[27], w[28], offset); + w[42] = amd_bytealign (w[26], w[27], offset); + w[41] = amd_bytealign (w[25], w[26], offset); + w[40] = amd_bytealign (w[24], w[25], offset); + w[39] = amd_bytealign (w[23], w[24], offset); + w[38] = amd_bytealign (w[22], w[23], offset); + w[37] = amd_bytealign (w[21], w[22], offset); + w[36] = amd_bytealign (w[20], w[21], offset); + w[35] = amd_bytealign (w[19], w[20], offset); + w[34] = amd_bytealign (w[18], w[19], offset); + w[33] = amd_bytealign (w[17], w[18], offset); + w[32] = amd_bytealign (w[16], w[17], offset); + w[31] = amd_bytealign (w[15], w[16], offset); + w[30] = amd_bytealign (w[14], w[15], offset); + w[29] = amd_bytealign (w[13], w[14], offset); + w[28] = amd_bytealign (w[12], w[13], offset); + w[27] = amd_bytealign (w[11], w[12], offset); + w[26] = amd_bytealign (w[10], w[11], offset); + w[25] = amd_bytealign (w[ 9], w[10], offset); + w[24] = amd_bytealign (w[ 8], w[ 9], offset); + w[23] = amd_bytealign (w[ 7], w[ 8], offset); + w[22] = amd_bytealign (w[ 6], w[ 7], offset); + w[21] = amd_bytealign (w[ 5], w[ 6], offset); + w[20] = amd_bytealign (w[ 4], w[ 5], offset); + w[19] = amd_bytealign (w[ 3], w[ 4], offset); + w[18] = amd_bytealign (w[ 2], w[ 3], offset); + w[17] = amd_bytealign (w[ 1], w[ 2], offset); + w[16] = amd_bytealign (w[ 0], w[ 1], offset); + w[15] = amd_bytealign ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 
0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = amd_bytealign (w[46], w[47], offset); + w[62] = amd_bytealign (w[45], w[46], offset); + w[61] = amd_bytealign (w[44], w[45], offset); + w[60] = amd_bytealign (w[43], w[44], offset); + w[59] = amd_bytealign (w[42], w[43], offset); + w[58] = amd_bytealign (w[41], w[42], offset); + w[57] = amd_bytealign (w[40], w[41], offset); + w[56] = amd_bytealign (w[39], w[40], offset); + w[55] = amd_bytealign (w[38], w[39], offset); + w[54] = amd_bytealign (w[37], w[38], offset); + w[53] = amd_bytealign (w[36], w[37], offset); + w[52] = amd_bytealign (w[35], w[36], offset); + w[51] = amd_bytealign (w[34], w[35], offset); + w[50] = amd_bytealign (w[33], w[34], offset); + w[49] = amd_bytealign (w[32], w[33], offset); + w[48] = amd_bytealign (w[31], w[32], offset); + w[47] = amd_bytealign (w[30], w[31], offset); + w[46] = amd_bytealign (w[29], w[30], offset); + w[45] = amd_bytealign (w[28], w[29], offset); + w[44] = amd_bytealign (w[27], w[28], offset); + w[43] = amd_bytealign (w[26], w[27], offset); + w[42] = amd_bytealign (w[25], w[26], offset); + w[41] = amd_bytealign (w[24], w[25], offset); + w[40] = amd_bytealign (w[23], w[24], offset); + w[39] = amd_bytealign (w[22], w[23], offset); + w[38] = amd_bytealign (w[21], w[22], offset); + w[37] = amd_bytealign (w[20], w[21], offset); + w[36] = amd_bytealign (w[19], w[20], offset); + w[35] = amd_bytealign (w[18], w[19], offset); + w[34] = amd_bytealign (w[17], w[18], offset); + w[33] = amd_bytealign (w[16], w[17], offset); + w[32] = amd_bytealign (w[15], w[16], offset); + w[31] = amd_bytealign (w[14], w[15], offset); + w[30] = amd_bytealign (w[13], w[14], offset); + w[29] = amd_bytealign (w[12], w[13], offset); + w[28] = amd_bytealign (w[11], w[12], offset); + w[27] = amd_bytealign (w[10], w[11], offset); + w[26] = amd_bytealign (w[ 9], w[10], offset); + w[25] = amd_bytealign (w[ 8], w[ 9], offset); + w[24] = amd_bytealign (w[ 7], w[ 8], offset); + w[23] = 
amd_bytealign (w[ 6], w[ 7], offset); + w[22] = amd_bytealign (w[ 5], w[ 6], offset); + w[21] = amd_bytealign (w[ 4], w[ 5], offset); + w[20] = amd_bytealign (w[ 3], w[ 4], offset); + w[19] = amd_bytealign (w[ 2], w[ 3], offset); + w[18] = amd_bytealign (w[ 1], w[ 2], offset); + w[17] = amd_bytealign (w[ 0], w[ 1], offset); + w[16] = amd_bytealign ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = amd_bytealign (w[45], w[46], offset); + w[62] = amd_bytealign (w[44], w[45], offset); + w[61] = amd_bytealign (w[43], w[44], offset); + w[60] = amd_bytealign (w[42], w[43], offset); + w[59] = amd_bytealign (w[41], w[42], offset); + w[58] = amd_bytealign (w[40], w[41], offset); + w[57] = amd_bytealign (w[39], w[40], offset); + w[56] = amd_bytealign (w[38], w[39], offset); + w[55] = amd_bytealign (w[37], w[38], offset); + w[54] = amd_bytealign (w[36], w[37], offset); + w[53] = amd_bytealign (w[35], w[36], offset); + w[52] = amd_bytealign (w[34], w[35], offset); + w[51] = amd_bytealign (w[33], w[34], offset); + w[50] = amd_bytealign (w[32], w[33], offset); + w[49] = amd_bytealign (w[31], w[32], offset); + w[48] = amd_bytealign (w[30], w[31], offset); + w[47] = amd_bytealign (w[29], w[30], offset); + w[46] = amd_bytealign (w[28], w[29], offset); + w[45] = amd_bytealign (w[27], w[28], offset); + w[44] = amd_bytealign (w[26], w[27], offset); + w[43] = amd_bytealign (w[25], w[26], offset); + w[42] = amd_bytealign (w[24], w[25], offset); + w[41] = amd_bytealign (w[23], w[24], offset); + w[40] = amd_bytealign (w[22], w[23], offset); + w[39] = amd_bytealign (w[21], w[22], offset); + w[38] = amd_bytealign (w[20], w[21], offset); + w[37] = amd_bytealign (w[19], w[20], offset); + w[36] = amd_bytealign (w[18], w[19], offset); + w[35] = amd_bytealign (w[17], w[18], offset); + 
w[34] = amd_bytealign (w[16], w[17], offset); + w[33] = amd_bytealign (w[15], w[16], offset); + w[32] = amd_bytealign (w[14], w[15], offset); + w[31] = amd_bytealign (w[13], w[14], offset); + w[30] = amd_bytealign (w[12], w[13], offset); + w[29] = amd_bytealign (w[11], w[12], offset); + w[28] = amd_bytealign (w[10], w[11], offset); + w[27] = amd_bytealign (w[ 9], w[10], offset); + w[26] = amd_bytealign (w[ 8], w[ 9], offset); + w[25] = amd_bytealign (w[ 7], w[ 8], offset); + w[24] = amd_bytealign (w[ 6], w[ 7], offset); + w[23] = amd_bytealign (w[ 5], w[ 6], offset); + w[22] = amd_bytealign (w[ 4], w[ 5], offset); + w[21] = amd_bytealign (w[ 3], w[ 4], offset); + w[20] = amd_bytealign (w[ 2], w[ 3], offset); + w[19] = amd_bytealign (w[ 1], w[ 2], offset); + w[18] = amd_bytealign (w[ 0], w[ 1], offset); + w[17] = amd_bytealign ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = amd_bytealign (w[44], w[45], offset); + w[62] = amd_bytealign (w[43], w[44], offset); + w[61] = amd_bytealign (w[42], w[43], offset); + w[60] = amd_bytealign (w[41], w[42], offset); + w[59] = amd_bytealign (w[40], w[41], offset); + w[58] = amd_bytealign (w[39], w[40], offset); + w[57] = amd_bytealign (w[38], w[39], offset); + w[56] = amd_bytealign (w[37], w[38], offset); + w[55] = amd_bytealign (w[36], w[37], offset); + w[54] = amd_bytealign (w[35], w[36], offset); + w[53] = amd_bytealign (w[34], w[35], offset); + w[52] = amd_bytealign (w[33], w[34], offset); + w[51] = amd_bytealign (w[32], w[33], offset); + w[50] = amd_bytealign (w[31], w[32], offset); + w[49] = amd_bytealign (w[30], w[31], offset); + w[48] = amd_bytealign (w[29], w[30], offset); + w[47] = amd_bytealign (w[28], w[29], offset); + w[46] = amd_bytealign (w[27], w[28], offset); + w[45] = amd_bytealign (w[26], 
w[27], offset); + w[44] = amd_bytealign (w[25], w[26], offset); + w[43] = amd_bytealign (w[24], w[25], offset); + w[42] = amd_bytealign (w[23], w[24], offset); + w[41] = amd_bytealign (w[22], w[23], offset); + w[40] = amd_bytealign (w[21], w[22], offset); + w[39] = amd_bytealign (w[20], w[21], offset); + w[38] = amd_bytealign (w[19], w[20], offset); + w[37] = amd_bytealign (w[18], w[19], offset); + w[36] = amd_bytealign (w[17], w[18], offset); + w[35] = amd_bytealign (w[16], w[17], offset); + w[34] = amd_bytealign (w[15], w[16], offset); + w[33] = amd_bytealign (w[14], w[15], offset); + w[32] = amd_bytealign (w[13], w[14], offset); + w[31] = amd_bytealign (w[12], w[13], offset); + w[30] = amd_bytealign (w[11], w[12], offset); + w[29] = amd_bytealign (w[10], w[11], offset); + w[28] = amd_bytealign (w[ 9], w[10], offset); + w[27] = amd_bytealign (w[ 8], w[ 9], offset); + w[26] = amd_bytealign (w[ 7], w[ 8], offset); + w[25] = amd_bytealign (w[ 6], w[ 7], offset); + w[24] = amd_bytealign (w[ 5], w[ 6], offset); + w[23] = amd_bytealign (w[ 4], w[ 5], offset); + w[22] = amd_bytealign (w[ 3], w[ 4], offset); + w[21] = amd_bytealign (w[ 2], w[ 3], offset); + w[20] = amd_bytealign (w[ 1], w[ 2], offset); + w[19] = amd_bytealign (w[ 0], w[ 1], offset); + w[18] = amd_bytealign ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = amd_bytealign (w[43], w[44], offset); + w[62] = amd_bytealign (w[42], w[43], offset); + w[61] = amd_bytealign (w[41], w[42], offset); + w[60] = amd_bytealign (w[40], w[41], offset); + w[59] = amd_bytealign (w[39], w[40], offset); + w[58] = amd_bytealign (w[38], w[39], offset); + w[57] = amd_bytealign (w[37], w[38], offset); + w[56] = amd_bytealign (w[36], w[37], offset); + w[55] = amd_bytealign (w[35], w[36], offset); 
+ w[54] = amd_bytealign (w[34], w[35], offset); + w[53] = amd_bytealign (w[33], w[34], offset); + w[52] = amd_bytealign (w[32], w[33], offset); + w[51] = amd_bytealign (w[31], w[32], offset); + w[50] = amd_bytealign (w[30], w[31], offset); + w[49] = amd_bytealign (w[29], w[30], offset); + w[48] = amd_bytealign (w[28], w[29], offset); + w[47] = amd_bytealign (w[27], w[28], offset); + w[46] = amd_bytealign (w[26], w[27], offset); + w[45] = amd_bytealign (w[25], w[26], offset); + w[44] = amd_bytealign (w[24], w[25], offset); + w[43] = amd_bytealign (w[23], w[24], offset); + w[42] = amd_bytealign (w[22], w[23], offset); + w[41] = amd_bytealign (w[21], w[22], offset); + w[40] = amd_bytealign (w[20], w[21], offset); + w[39] = amd_bytealign (w[19], w[20], offset); + w[38] = amd_bytealign (w[18], w[19], offset); + w[37] = amd_bytealign (w[17], w[18], offset); + w[36] = amd_bytealign (w[16], w[17], offset); + w[35] = amd_bytealign (w[15], w[16], offset); + w[34] = amd_bytealign (w[14], w[15], offset); + w[33] = amd_bytealign (w[13], w[14], offset); + w[32] = amd_bytealign (w[12], w[13], offset); + w[31] = amd_bytealign (w[11], w[12], offset); + w[30] = amd_bytealign (w[10], w[11], offset); + w[29] = amd_bytealign (w[ 9], w[10], offset); + w[28] = amd_bytealign (w[ 8], w[ 9], offset); + w[27] = amd_bytealign (w[ 7], w[ 8], offset); + w[26] = amd_bytealign (w[ 6], w[ 7], offset); + w[25] = amd_bytealign (w[ 5], w[ 6], offset); + w[24] = amd_bytealign (w[ 4], w[ 5], offset); + w[23] = amd_bytealign (w[ 3], w[ 4], offset); + w[22] = amd_bytealign (w[ 2], w[ 3], offset); + w[21] = amd_bytealign (w[ 1], w[ 2], offset); + w[20] = amd_bytealign (w[ 0], w[ 1], offset); + w[19] = amd_bytealign ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + 
w[63] = amd_bytealign (w[42], w[43], offset); + w[62] = amd_bytealign (w[41], w[42], offset); + w[61] = amd_bytealign (w[40], w[41], offset); + w[60] = amd_bytealign (w[39], w[40], offset); + w[59] = amd_bytealign (w[38], w[39], offset); + w[58] = amd_bytealign (w[37], w[38], offset); + w[57] = amd_bytealign (w[36], w[37], offset); + w[56] = amd_bytealign (w[35], w[36], offset); + w[55] = amd_bytealign (w[34], w[35], offset); + w[54] = amd_bytealign (w[33], w[34], offset); + w[53] = amd_bytealign (w[32], w[33], offset); + w[52] = amd_bytealign (w[31], w[32], offset); + w[51] = amd_bytealign (w[30], w[31], offset); + w[50] = amd_bytealign (w[29], w[30], offset); + w[49] = amd_bytealign (w[28], w[29], offset); + w[48] = amd_bytealign (w[27], w[28], offset); + w[47] = amd_bytealign (w[26], w[27], offset); + w[46] = amd_bytealign (w[25], w[26], offset); + w[45] = amd_bytealign (w[24], w[25], offset); + w[44] = amd_bytealign (w[23], w[24], offset); + w[43] = amd_bytealign (w[22], w[23], offset); + w[42] = amd_bytealign (w[21], w[22], offset); + w[41] = amd_bytealign (w[20], w[21], offset); + w[40] = amd_bytealign (w[19], w[20], offset); + w[39] = amd_bytealign (w[18], w[19], offset); + w[38] = amd_bytealign (w[17], w[18], offset); + w[37] = amd_bytealign (w[16], w[17], offset); + w[36] = amd_bytealign (w[15], w[16], offset); + w[35] = amd_bytealign (w[14], w[15], offset); + w[34] = amd_bytealign (w[13], w[14], offset); + w[33] = amd_bytealign (w[12], w[13], offset); + w[32] = amd_bytealign (w[11], w[12], offset); + w[31] = amd_bytealign (w[10], w[11], offset); + w[30] = amd_bytealign (w[ 9], w[10], offset); + w[29] = amd_bytealign (w[ 8], w[ 9], offset); + w[28] = amd_bytealign (w[ 7], w[ 8], offset); + w[27] = amd_bytealign (w[ 6], w[ 7], offset); + w[26] = amd_bytealign (w[ 5], w[ 6], offset); + w[25] = amd_bytealign (w[ 4], w[ 5], offset); + w[24] = amd_bytealign (w[ 3], w[ 4], offset); + w[23] = amd_bytealign (w[ 2], w[ 3], offset); + w[22] = amd_bytealign (w[ 1], 
w[ 2], offset); + w[21] = amd_bytealign (w[ 0], w[ 1], offset); + w[20] = amd_bytealign ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = amd_bytealign (w[41], w[42], offset); + w[62] = amd_bytealign (w[40], w[41], offset); + w[61] = amd_bytealign (w[39], w[40], offset); + w[60] = amd_bytealign (w[38], w[39], offset); + w[59] = amd_bytealign (w[37], w[38], offset); + w[58] = amd_bytealign (w[36], w[37], offset); + w[57] = amd_bytealign (w[35], w[36], offset); + w[56] = amd_bytealign (w[34], w[35], offset); + w[55] = amd_bytealign (w[33], w[34], offset); + w[54] = amd_bytealign (w[32], w[33], offset); + w[53] = amd_bytealign (w[31], w[32], offset); + w[52] = amd_bytealign (w[30], w[31], offset); + w[51] = amd_bytealign (w[29], w[30], offset); + w[50] = amd_bytealign (w[28], w[29], offset); + w[49] = amd_bytealign (w[27], w[28], offset); + w[48] = amd_bytealign (w[26], w[27], offset); + w[47] = amd_bytealign (w[25], w[26], offset); + w[46] = amd_bytealign (w[24], w[25], offset); + w[45] = amd_bytealign (w[23], w[24], offset); + w[44] = amd_bytealign (w[22], w[23], offset); + w[43] = amd_bytealign (w[21], w[22], offset); + w[42] = amd_bytealign (w[20], w[21], offset); + w[41] = amd_bytealign (w[19], w[20], offset); + w[40] = amd_bytealign (w[18], w[19], offset); + w[39] = amd_bytealign (w[17], w[18], offset); + w[38] = amd_bytealign (w[16], w[17], offset); + w[37] = amd_bytealign (w[15], w[16], offset); + w[36] = amd_bytealign (w[14], w[15], offset); + w[35] = amd_bytealign (w[13], w[14], offset); + w[34] = amd_bytealign (w[12], w[13], offset); + w[33] = amd_bytealign (w[11], w[12], offset); + w[32] = amd_bytealign (w[10], w[11], offset); + w[31] = amd_bytealign (w[ 9], w[10], offset); + w[30] = 
amd_bytealign (w[ 8], w[ 9], offset); + w[29] = amd_bytealign (w[ 7], w[ 8], offset); + w[28] = amd_bytealign (w[ 6], w[ 7], offset); + w[27] = amd_bytealign (w[ 5], w[ 6], offset); + w[26] = amd_bytealign (w[ 4], w[ 5], offset); + w[25] = amd_bytealign (w[ 3], w[ 4], offset); + w[24] = amd_bytealign (w[ 2], w[ 3], offset); + w[23] = amd_bytealign (w[ 1], w[ 2], offset); + w[22] = amd_bytealign (w[ 0], w[ 1], offset); + w[21] = amd_bytealign ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = amd_bytealign (w[40], w[41], offset); + w[62] = amd_bytealign (w[39], w[40], offset); + w[61] = amd_bytealign (w[38], w[39], offset); + w[60] = amd_bytealign (w[37], w[38], offset); + w[59] = amd_bytealign (w[36], w[37], offset); + w[58] = amd_bytealign (w[35], w[36], offset); + w[57] = amd_bytealign (w[34], w[35], offset); + w[56] = amd_bytealign (w[33], w[34], offset); + w[55] = amd_bytealign (w[32], w[33], offset); + w[54] = amd_bytealign (w[31], w[32], offset); + w[53] = amd_bytealign (w[30], w[31], offset); + w[52] = amd_bytealign (w[29], w[30], offset); + w[51] = amd_bytealign (w[28], w[29], offset); + w[50] = amd_bytealign (w[27], w[28], offset); + w[49] = amd_bytealign (w[26], w[27], offset); + w[48] = amd_bytealign (w[25], w[26], offset); + w[47] = amd_bytealign (w[24], w[25], offset); + w[46] = amd_bytealign (w[23], w[24], offset); + w[45] = amd_bytealign (w[22], w[23], offset); + w[44] = amd_bytealign (w[21], w[22], offset); + w[43] = amd_bytealign (w[20], w[21], offset); + w[42] = amd_bytealign (w[19], w[20], offset); + w[41] = amd_bytealign (w[18], w[19], offset); + w[40] = amd_bytealign (w[17], w[18], offset); + w[39] = amd_bytealign (w[16], w[17], offset); + w[38] = amd_bytealign (w[15], 
w[16], offset); + w[37] = amd_bytealign (w[14], w[15], offset); + w[36] = amd_bytealign (w[13], w[14], offset); + w[35] = amd_bytealign (w[12], w[13], offset); + w[34] = amd_bytealign (w[11], w[12], offset); + w[33] = amd_bytealign (w[10], w[11], offset); + w[32] = amd_bytealign (w[ 9], w[10], offset); + w[31] = amd_bytealign (w[ 8], w[ 9], offset); + w[30] = amd_bytealign (w[ 7], w[ 8], offset); + w[29] = amd_bytealign (w[ 6], w[ 7], offset); + w[28] = amd_bytealign (w[ 5], w[ 6], offset); + w[27] = amd_bytealign (w[ 4], w[ 5], offset); + w[26] = amd_bytealign (w[ 3], w[ 4], offset); + w[25] = amd_bytealign (w[ 2], w[ 3], offset); + w[24] = amd_bytealign (w[ 1], w[ 2], offset); + w[23] = amd_bytealign (w[ 0], w[ 1], offset); + w[22] = amd_bytealign ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = amd_bytealign (w[39], w[40], offset); + w[62] = amd_bytealign (w[38], w[39], offset); + w[61] = amd_bytealign (w[37], w[38], offset); + w[60] = amd_bytealign (w[36], w[37], offset); + w[59] = amd_bytealign (w[35], w[36], offset); + w[58] = amd_bytealign (w[34], w[35], offset); + w[57] = amd_bytealign (w[33], w[34], offset); + w[56] = amd_bytealign (w[32], w[33], offset); + w[55] = amd_bytealign (w[31], w[32], offset); + w[54] = amd_bytealign (w[30], w[31], offset); + w[53] = amd_bytealign (w[29], w[30], offset); + w[52] = amd_bytealign (w[28], w[29], offset); + w[51] = amd_bytealign (w[27], w[28], offset); + w[50] = amd_bytealign (w[26], w[27], offset); + w[49] = amd_bytealign (w[25], w[26], offset); + w[48] = amd_bytealign (w[24], w[25], offset); + w[47] = amd_bytealign (w[23], w[24], offset); + w[46] = amd_bytealign (w[22], w[23], offset); + w[45] = amd_bytealign (w[21], w[22], 
offset); + w[44] = amd_bytealign (w[20], w[21], offset); + w[43] = amd_bytealign (w[19], w[20], offset); + w[42] = amd_bytealign (w[18], w[19], offset); + w[41] = amd_bytealign (w[17], w[18], offset); + w[40] = amd_bytealign (w[16], w[17], offset); + w[39] = amd_bytealign (w[15], w[16], offset); + w[38] = amd_bytealign (w[14], w[15], offset); + w[37] = amd_bytealign (w[13], w[14], offset); + w[36] = amd_bytealign (w[12], w[13], offset); + w[35] = amd_bytealign (w[11], w[12], offset); + w[34] = amd_bytealign (w[10], w[11], offset); + w[33] = amd_bytealign (w[ 9], w[10], offset); + w[32] = amd_bytealign (w[ 8], w[ 9], offset); + w[31] = amd_bytealign (w[ 7], w[ 8], offset); + w[30] = amd_bytealign (w[ 6], w[ 7], offset); + w[29] = amd_bytealign (w[ 5], w[ 6], offset); + w[28] = amd_bytealign (w[ 4], w[ 5], offset); + w[27] = amd_bytealign (w[ 3], w[ 4], offset); + w[26] = amd_bytealign (w[ 2], w[ 3], offset); + w[25] = amd_bytealign (w[ 1], w[ 2], offset); + w[24] = amd_bytealign (w[ 0], w[ 1], offset); + w[23] = amd_bytealign ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = amd_bytealign (w[38], w[39], offset); + w[62] = amd_bytealign (w[37], w[38], offset); + w[61] = amd_bytealign (w[36], w[37], offset); + w[60] = amd_bytealign (w[35], w[36], offset); + w[59] = amd_bytealign (w[34], w[35], offset); + w[58] = amd_bytealign (w[33], w[34], offset); + w[57] = amd_bytealign (w[32], w[33], offset); + w[56] = amd_bytealign (w[31], w[32], offset); + w[55] = amd_bytealign (w[30], w[31], offset); + w[54] = amd_bytealign (w[29], w[30], offset); + w[53] = amd_bytealign (w[28], w[29], offset); + w[52] = amd_bytealign (w[27], w[28], offset); + w[51] = amd_bytealign (w[26], w[27], 
offset); + w[50] = amd_bytealign (w[25], w[26], offset); + w[49] = amd_bytealign (w[24], w[25], offset); + w[48] = amd_bytealign (w[23], w[24], offset); + w[47] = amd_bytealign (w[22], w[23], offset); + w[46] = amd_bytealign (w[21], w[22], offset); + w[45] = amd_bytealign (w[20], w[21], offset); + w[44] = amd_bytealign (w[19], w[20], offset); + w[43] = amd_bytealign (w[18], w[19], offset); + w[42] = amd_bytealign (w[17], w[18], offset); + w[41] = amd_bytealign (w[16], w[17], offset); + w[40] = amd_bytealign (w[15], w[16], offset); + w[39] = amd_bytealign (w[14], w[15], offset); + w[38] = amd_bytealign (w[13], w[14], offset); + w[37] = amd_bytealign (w[12], w[13], offset); + w[36] = amd_bytealign (w[11], w[12], offset); + w[35] = amd_bytealign (w[10], w[11], offset); + w[34] = amd_bytealign (w[ 9], w[10], offset); + w[33] = amd_bytealign (w[ 8], w[ 9], offset); + w[32] = amd_bytealign (w[ 7], w[ 8], offset); + w[31] = amd_bytealign (w[ 6], w[ 7], offset); + w[30] = amd_bytealign (w[ 5], w[ 6], offset); + w[29] = amd_bytealign (w[ 4], w[ 5], offset); + w[28] = amd_bytealign (w[ 3], w[ 4], offset); + w[27] = amd_bytealign (w[ 2], w[ 3], offset); + w[26] = amd_bytealign (w[ 1], w[ 2], offset); + w[25] = amd_bytealign (w[ 0], w[ 1], offset); + w[24] = amd_bytealign ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = amd_bytealign (w[37], w[38], offset); + w[62] = amd_bytealign (w[36], w[37], offset); + w[61] = amd_bytealign (w[35], w[36], offset); + w[60] = amd_bytealign (w[34], w[35], offset); + w[59] = amd_bytealign (w[33], w[34], offset); + w[58] = amd_bytealign (w[32], w[33], offset); + w[57] = amd_bytealign (w[31], w[32], offset); + w[56] = amd_bytealign 
(w[30], w[31], offset); + w[55] = amd_bytealign (w[29], w[30], offset); + w[54] = amd_bytealign (w[28], w[29], offset); + w[53] = amd_bytealign (w[27], w[28], offset); + w[52] = amd_bytealign (w[26], w[27], offset); + w[51] = amd_bytealign (w[25], w[26], offset); + w[50] = amd_bytealign (w[24], w[25], offset); + w[49] = amd_bytealign (w[23], w[24], offset); + w[48] = amd_bytealign (w[22], w[23], offset); + w[47] = amd_bytealign (w[21], w[22], offset); + w[46] = amd_bytealign (w[20], w[21], offset); + w[45] = amd_bytealign (w[19], w[20], offset); + w[44] = amd_bytealign (w[18], w[19], offset); + w[43] = amd_bytealign (w[17], w[18], offset); + w[42] = amd_bytealign (w[16], w[17], offset); + w[41] = amd_bytealign (w[15], w[16], offset); + w[40] = amd_bytealign (w[14], w[15], offset); + w[39] = amd_bytealign (w[13], w[14], offset); + w[38] = amd_bytealign (w[12], w[13], offset); + w[37] = amd_bytealign (w[11], w[12], offset); + w[36] = amd_bytealign (w[10], w[11], offset); + w[35] = amd_bytealign (w[ 9], w[10], offset); + w[34] = amd_bytealign (w[ 8], w[ 9], offset); + w[33] = amd_bytealign (w[ 7], w[ 8], offset); + w[32] = amd_bytealign (w[ 6], w[ 7], offset); + w[31] = amd_bytealign (w[ 5], w[ 6], offset); + w[30] = amd_bytealign (w[ 4], w[ 5], offset); + w[29] = amd_bytealign (w[ 3], w[ 4], offset); + w[28] = amd_bytealign (w[ 2], w[ 3], offset); + w[27] = amd_bytealign (w[ 1], w[ 2], offset); + w[26] = amd_bytealign (w[ 0], w[ 1], offset); + w[25] = amd_bytealign ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = amd_bytealign (w[36], w[37], offset); + w[62] = amd_bytealign (w[35], w[36], offset); + w[61] = amd_bytealign (w[34], w[35], 
offset); + w[60] = amd_bytealign (w[33], w[34], offset); + w[59] = amd_bytealign (w[32], w[33], offset); + w[58] = amd_bytealign (w[31], w[32], offset); + w[57] = amd_bytealign (w[30], w[31], offset); + w[56] = amd_bytealign (w[29], w[30], offset); + w[55] = amd_bytealign (w[28], w[29], offset); + w[54] = amd_bytealign (w[27], w[28], offset); + w[53] = amd_bytealign (w[26], w[27], offset); + w[52] = amd_bytealign (w[25], w[26], offset); + w[51] = amd_bytealign (w[24], w[25], offset); + w[50] = amd_bytealign (w[23], w[24], offset); + w[49] = amd_bytealign (w[22], w[23], offset); + w[48] = amd_bytealign (w[21], w[22], offset); + w[47] = amd_bytealign (w[20], w[21], offset); + w[46] = amd_bytealign (w[19], w[20], offset); + w[45] = amd_bytealign (w[18], w[19], offset); + w[44] = amd_bytealign (w[17], w[18], offset); + w[43] = amd_bytealign (w[16], w[17], offset); + w[42] = amd_bytealign (w[15], w[16], offset); + w[41] = amd_bytealign (w[14], w[15], offset); + w[40] = amd_bytealign (w[13], w[14], offset); + w[39] = amd_bytealign (w[12], w[13], offset); + w[38] = amd_bytealign (w[11], w[12], offset); + w[37] = amd_bytealign (w[10], w[11], offset); + w[36] = amd_bytealign (w[ 9], w[10], offset); + w[35] = amd_bytealign (w[ 8], w[ 9], offset); + w[34] = amd_bytealign (w[ 7], w[ 8], offset); + w[33] = amd_bytealign (w[ 6], w[ 7], offset); + w[32] = amd_bytealign (w[ 5], w[ 6], offset); + w[31] = amd_bytealign (w[ 4], w[ 5], offset); + w[30] = amd_bytealign (w[ 3], w[ 4], offset); + w[29] = amd_bytealign (w[ 2], w[ 3], offset); + w[28] = amd_bytealign (w[ 1], w[ 2], offset); + w[27] = amd_bytealign (w[ 0], w[ 1], offset); + w[26] = amd_bytealign ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + 
w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = amd_bytealign (w[35], w[36], offset); + w[62] = amd_bytealign (w[34], w[35], offset); + w[61] = amd_bytealign (w[33], w[34], offset); + w[60] = amd_bytealign (w[32], w[33], offset); + w[59] = amd_bytealign (w[31], w[32], offset); + w[58] = amd_bytealign (w[30], w[31], offset); + w[57] = amd_bytealign (w[29], w[30], offset); + w[56] = amd_bytealign (w[28], w[29], offset); + w[55] = amd_bytealign (w[27], w[28], offset); + w[54] = amd_bytealign (w[26], w[27], offset); + w[53] = amd_bytealign (w[25], w[26], offset); + w[52] = amd_bytealign (w[24], w[25], offset); + w[51] = amd_bytealign (w[23], w[24], offset); + w[50] = amd_bytealign (w[22], w[23], offset); + w[49] = amd_bytealign (w[21], w[22], offset); + w[48] = amd_bytealign (w[20], w[21], offset); + w[47] = amd_bytealign (w[19], w[20], offset); + w[46] = amd_bytealign (w[18], w[19], offset); + w[45] = amd_bytealign (w[17], w[18], offset); + w[44] = amd_bytealign (w[16], w[17], offset); + w[43] = amd_bytealign (w[15], w[16], offset); + w[42] = amd_bytealign (w[14], w[15], offset); + w[41] = amd_bytealign (w[13], w[14], offset); + w[40] = amd_bytealign (w[12], w[13], offset); + w[39] = amd_bytealign (w[11], w[12], offset); + w[38] = amd_bytealign (w[10], w[11], offset); + w[37] = amd_bytealign (w[ 9], w[10], offset); + w[36] = amd_bytealign (w[ 8], w[ 9], offset); + w[35] = amd_bytealign (w[ 7], w[ 8], offset); + w[34] = amd_bytealign (w[ 6], w[ 7], offset); + w[33] = amd_bytealign (w[ 5], w[ 6], offset); + w[32] = amd_bytealign (w[ 4], w[ 5], offset); + w[31] = amd_bytealign (w[ 3], w[ 4], offset); + w[30] = amd_bytealign (w[ 2], w[ 3], offset); + w[29] = amd_bytealign (w[ 1], w[ 2], offset); + w[28] = amd_bytealign (w[ 0], w[ 1], offset); + w[27] = amd_bytealign ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 
0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = amd_bytealign (w[34], w[35], offset); + w[62] = amd_bytealign (w[33], w[34], offset); + w[61] = amd_bytealign (w[32], w[33], offset); + w[60] = amd_bytealign (w[31], w[32], offset); + w[59] = amd_bytealign (w[30], w[31], offset); + w[58] = amd_bytealign (w[29], w[30], offset); + w[57] = amd_bytealign (w[28], w[29], offset); + w[56] = amd_bytealign (w[27], w[28], offset); + w[55] = amd_bytealign (w[26], w[27], offset); + w[54] = amd_bytealign (w[25], w[26], offset); + w[53] = amd_bytealign (w[24], w[25], offset); + w[52] = amd_bytealign (w[23], w[24], offset); + w[51] = amd_bytealign (w[22], w[23], offset); + w[50] = amd_bytealign (w[21], w[22], offset); + w[49] = amd_bytealign (w[20], w[21], offset); + w[48] = amd_bytealign (w[19], w[20], offset); + w[47] = amd_bytealign (w[18], w[19], offset); + w[46] = amd_bytealign (w[17], w[18], offset); + w[45] = amd_bytealign (w[16], w[17], offset); + w[44] = amd_bytealign (w[15], w[16], offset); + w[43] = amd_bytealign (w[14], w[15], offset); + w[42] = amd_bytealign (w[13], w[14], offset); + w[41] = amd_bytealign (w[12], w[13], offset); + w[40] = amd_bytealign (w[11], w[12], offset); + w[39] = amd_bytealign (w[10], w[11], offset); + w[38] = amd_bytealign (w[ 9], w[10], offset); + w[37] = amd_bytealign (w[ 8], w[ 9], offset); + w[36] = amd_bytealign (w[ 7], w[ 8], offset); + w[35] = amd_bytealign (w[ 6], w[ 7], offset); + w[34] = amd_bytealign (w[ 5], w[ 6], offset); + w[33] = amd_bytealign (w[ 4], w[ 5], offset); + w[32] = amd_bytealign (w[ 3], w[ 4], offset); + w[31] = amd_bytealign (w[ 2], w[ 3], offset); + w[30] = amd_bytealign (w[ 1], w[ 2], offset); + w[29] = amd_bytealign (w[ 0], w[ 1], offset); + w[28] = amd_bytealign ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; 
+ w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = amd_bytealign (w[33], w[34], offset); + w[62] = amd_bytealign (w[32], w[33], offset); + w[61] = amd_bytealign (w[31], w[32], offset); + w[60] = amd_bytealign (w[30], w[31], offset); + w[59] = amd_bytealign (w[29], w[30], offset); + w[58] = amd_bytealign (w[28], w[29], offset); + w[57] = amd_bytealign (w[27], w[28], offset); + w[56] = amd_bytealign (w[26], w[27], offset); + w[55] = amd_bytealign (w[25], w[26], offset); + w[54] = amd_bytealign (w[24], w[25], offset); + w[53] = amd_bytealign (w[23], w[24], offset); + w[52] = amd_bytealign (w[22], w[23], offset); + w[51] = amd_bytealign (w[21], w[22], offset); + w[50] = amd_bytealign (w[20], w[21], offset); + w[49] = amd_bytealign (w[19], w[20], offset); + w[48] = amd_bytealign (w[18], w[19], offset); + w[47] = amd_bytealign (w[17], w[18], offset); + w[46] = amd_bytealign (w[16], w[17], offset); + w[45] = amd_bytealign (w[15], w[16], offset); + w[44] = amd_bytealign (w[14], w[15], offset); + w[43] = amd_bytealign (w[13], w[14], offset); + w[42] = amd_bytealign (w[12], w[13], offset); + w[41] = amd_bytealign (w[11], w[12], offset); + w[40] = amd_bytealign (w[10], w[11], offset); + w[39] = amd_bytealign (w[ 9], w[10], offset); + w[38] = amd_bytealign (w[ 8], w[ 9], offset); + w[37] = amd_bytealign (w[ 7], w[ 8], offset); + w[36] = amd_bytealign (w[ 6], w[ 7], offset); + w[35] = amd_bytealign (w[ 5], w[ 6], offset); + w[34] = amd_bytealign (w[ 4], w[ 5], offset); + w[33] = amd_bytealign (w[ 3], w[ 4], offset); + w[32] = amd_bytealign (w[ 2], w[ 3], offset); + w[31] = amd_bytealign (w[ 1], w[ 2], offset); + w[30] = amd_bytealign (w[ 0], w[ 1], offset); + w[29] = amd_bytealign ( 0, w[ 0], offset); + w[28] = 0; 
+ w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = amd_bytealign (w[32], w[33], offset); + w[62] = amd_bytealign (w[31], w[32], offset); + w[61] = amd_bytealign (w[30], w[31], offset); + w[60] = amd_bytealign (w[29], w[30], offset); + w[59] = amd_bytealign (w[28], w[29], offset); + w[58] = amd_bytealign (w[27], w[28], offset); + w[57] = amd_bytealign (w[26], w[27], offset); + w[56] = amd_bytealign (w[25], w[26], offset); + w[55] = amd_bytealign (w[24], w[25], offset); + w[54] = amd_bytealign (w[23], w[24], offset); + w[53] = amd_bytealign (w[22], w[23], offset); + w[52] = amd_bytealign (w[21], w[22], offset); + w[51] = amd_bytealign (w[20], w[21], offset); + w[50] = amd_bytealign (w[19], w[20], offset); + w[49] = amd_bytealign (w[18], w[19], offset); + w[48] = amd_bytealign (w[17], w[18], offset); + w[47] = amd_bytealign (w[16], w[17], offset); + w[46] = amd_bytealign (w[15], w[16], offset); + w[45] = amd_bytealign (w[14], w[15], offset); + w[44] = amd_bytealign (w[13], w[14], offset); + w[43] = amd_bytealign (w[12], w[13], offset); + w[42] = amd_bytealign (w[11], w[12], offset); + w[41] = amd_bytealign (w[10], w[11], offset); + w[40] = amd_bytealign (w[ 9], w[10], offset); + w[39] = amd_bytealign (w[ 8], w[ 9], offset); + w[38] = amd_bytealign (w[ 7], w[ 8], offset); + w[37] = amd_bytealign (w[ 6], w[ 7], offset); + w[36] = amd_bytealign (w[ 5], w[ 6], offset); + w[35] = amd_bytealign (w[ 4], w[ 5], offset); + w[34] = amd_bytealign (w[ 3], w[ 4], offset); + w[33] = amd_bytealign (w[ 2], w[ 3], offset); + w[32] = amd_bytealign (w[ 1], w[ 2], offset); + w[31] = amd_bytealign (w[ 0], w[ 1], offset); + w[30] = amd_bytealign ( 
0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = amd_bytealign (w[31], w[32], offset); + w[62] = amd_bytealign (w[30], w[31], offset); + w[61] = amd_bytealign (w[29], w[30], offset); + w[60] = amd_bytealign (w[28], w[29], offset); + w[59] = amd_bytealign (w[27], w[28], offset); + w[58] = amd_bytealign (w[26], w[27], offset); + w[57] = amd_bytealign (w[25], w[26], offset); + w[56] = amd_bytealign (w[24], w[25], offset); + w[55] = amd_bytealign (w[23], w[24], offset); + w[54] = amd_bytealign (w[22], w[23], offset); + w[53] = amd_bytealign (w[21], w[22], offset); + w[52] = amd_bytealign (w[20], w[21], offset); + w[51] = amd_bytealign (w[19], w[20], offset); + w[50] = amd_bytealign (w[18], w[19], offset); + w[49] = amd_bytealign (w[17], w[18], offset); + w[48] = amd_bytealign (w[16], w[17], offset); + w[47] = amd_bytealign (w[15], w[16], offset); + w[46] = amd_bytealign (w[14], w[15], offset); + w[45] = amd_bytealign (w[13], w[14], offset); + w[44] = amd_bytealign (w[12], w[13], offset); + w[43] = amd_bytealign (w[11], w[12], offset); + w[42] = amd_bytealign (w[10], w[11], offset); + w[41] = amd_bytealign (w[ 9], w[10], offset); + w[40] = amd_bytealign (w[ 8], w[ 9], offset); + w[39] = amd_bytealign (w[ 7], w[ 8], offset); + w[38] = amd_bytealign (w[ 6], w[ 7], offset); + w[37] = amd_bytealign (w[ 5], w[ 6], offset); + w[36] = amd_bytealign (w[ 4], w[ 5], offset); + w[35] = amd_bytealign (w[ 3], w[ 4], offset); + w[34] = amd_bytealign (w[ 2], w[ 3], offset); + w[33] = amd_bytealign (w[ 1], w[ 2], offset); + w[32] = amd_bytealign (w[ 0], w[ 1], offset); + w[31] = amd_bytealign ( 0, 
w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = amd_bytealign (w[30], w[31], offset); + w[62] = amd_bytealign (w[29], w[30], offset); + w[61] = amd_bytealign (w[28], w[29], offset); + w[60] = amd_bytealign (w[27], w[28], offset); + w[59] = amd_bytealign (w[26], w[27], offset); + w[58] = amd_bytealign (w[25], w[26], offset); + w[57] = amd_bytealign (w[24], w[25], offset); + w[56] = amd_bytealign (w[23], w[24], offset); + w[55] = amd_bytealign (w[22], w[23], offset); + w[54] = amd_bytealign (w[21], w[22], offset); + w[53] = amd_bytealign (w[20], w[21], offset); + w[52] = amd_bytealign (w[19], w[20], offset); + w[51] = amd_bytealign (w[18], w[19], offset); + w[50] = amd_bytealign (w[17], w[18], offset); + w[49] = amd_bytealign (w[16], w[17], offset); + w[48] = amd_bytealign (w[15], w[16], offset); + w[47] = amd_bytealign (w[14], w[15], offset); + w[46] = amd_bytealign (w[13], w[14], offset); + w[45] = amd_bytealign (w[12], w[13], offset); + w[44] = amd_bytealign (w[11], w[12], offset); + w[43] = amd_bytealign (w[10], w[11], offset); + w[42] = amd_bytealign (w[ 9], w[10], offset); + w[41] = amd_bytealign (w[ 8], w[ 9], offset); + w[40] = amd_bytealign (w[ 7], w[ 8], offset); + w[39] = amd_bytealign (w[ 6], w[ 7], offset); + w[38] = amd_bytealign (w[ 5], w[ 6], offset); + w[37] = amd_bytealign (w[ 4], w[ 5], offset); + w[36] = amd_bytealign (w[ 3], w[ 4], offset); + w[35] = amd_bytealign (w[ 2], w[ 3], offset); + w[34] = amd_bytealign (w[ 1], w[ 2], offset); + w[33] = amd_bytealign (w[ 0], w[ 1], offset); + w[32] = amd_bytealign ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 
0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = amd_bytealign (w[29], w[30], offset); + w[62] = amd_bytealign (w[28], w[29], offset); + w[61] = amd_bytealign (w[27], w[28], offset); + w[60] = amd_bytealign (w[26], w[27], offset); + w[59] = amd_bytealign (w[25], w[26], offset); + w[58] = amd_bytealign (w[24], w[25], offset); + w[57] = amd_bytealign (w[23], w[24], offset); + w[56] = amd_bytealign (w[22], w[23], offset); + w[55] = amd_bytealign (w[21], w[22], offset); + w[54] = amd_bytealign (w[20], w[21], offset); + w[53] = amd_bytealign (w[19], w[20], offset); + w[52] = amd_bytealign (w[18], w[19], offset); + w[51] = amd_bytealign (w[17], w[18], offset); + w[50] = amd_bytealign (w[16], w[17], offset); + w[49] = amd_bytealign (w[15], w[16], offset); + w[48] = amd_bytealign (w[14], w[15], offset); + w[47] = amd_bytealign (w[13], w[14], offset); + w[46] = amd_bytealign (w[12], w[13], offset); + w[45] = amd_bytealign (w[11], w[12], offset); + w[44] = amd_bytealign (w[10], w[11], offset); + w[43] = amd_bytealign (w[ 9], w[10], offset); + w[42] = amd_bytealign (w[ 8], w[ 9], offset); + w[41] = amd_bytealign (w[ 7], w[ 8], offset); + w[40] = amd_bytealign (w[ 6], w[ 7], offset); + w[39] = amd_bytealign (w[ 5], w[ 6], offset); + w[38] = amd_bytealign (w[ 4], w[ 5], offset); + w[37] = amd_bytealign (w[ 3], w[ 4], offset); + w[36] = amd_bytealign (w[ 2], w[ 3], offset); + w[35] = amd_bytealign (w[ 1], w[ 2], offset); + w[34] = amd_bytealign (w[ 0], w[ 1], offset); + w[33] = amd_bytealign ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + 
w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = amd_bytealign (w[28], w[29], offset); + w[62] = amd_bytealign (w[27], w[28], offset); + w[61] = amd_bytealign (w[26], w[27], offset); + w[60] = amd_bytealign (w[25], w[26], offset); + w[59] = amd_bytealign (w[24], w[25], offset); + w[58] = amd_bytealign (w[23], w[24], offset); + w[57] = amd_bytealign (w[22], w[23], offset); + w[56] = amd_bytealign (w[21], w[22], offset); + w[55] = amd_bytealign (w[20], w[21], offset); + w[54] = amd_bytealign (w[19], w[20], offset); + w[53] = amd_bytealign (w[18], w[19], offset); + w[52] = amd_bytealign (w[17], w[18], offset); + w[51] = amd_bytealign (w[16], w[17], offset); + w[50] = amd_bytealign (w[15], w[16], offset); + w[49] = amd_bytealign (w[14], w[15], offset); + w[48] = amd_bytealign (w[13], w[14], offset); + w[47] = amd_bytealign (w[12], w[13], offset); + w[46] = amd_bytealign (w[11], w[12], offset); + w[45] = amd_bytealign (w[10], w[11], offset); + w[44] = amd_bytealign (w[ 9], w[10], offset); + w[43] = amd_bytealign (w[ 8], w[ 9], offset); + w[42] = amd_bytealign (w[ 7], w[ 8], offset); + w[41] = amd_bytealign (w[ 6], w[ 7], offset); + w[40] = amd_bytealign (w[ 5], w[ 6], offset); + w[39] = amd_bytealign (w[ 4], w[ 5], offset); + w[38] = amd_bytealign (w[ 3], w[ 4], offset); + w[37] = amd_bytealign (w[ 2], w[ 3], offset); + w[36] = amd_bytealign (w[ 1], w[ 2], offset); + w[35] = amd_bytealign (w[ 0], w[ 1], offset); + w[34] = amd_bytealign ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] 
= 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = amd_bytealign (w[27], w[28], offset); + w[62] = amd_bytealign (w[26], w[27], offset); + w[61] = amd_bytealign (w[25], w[26], offset); + w[60] = amd_bytealign (w[24], w[25], offset); + w[59] = amd_bytealign (w[23], w[24], offset); + w[58] = amd_bytealign (w[22], w[23], offset); + w[57] = amd_bytealign (w[21], w[22], offset); + w[56] = amd_bytealign (w[20], w[21], offset); + w[55] = amd_bytealign (w[19], w[20], offset); + w[54] = amd_bytealign (w[18], w[19], offset); + w[53] = amd_bytealign (w[17], w[18], offset); + w[52] = amd_bytealign (w[16], w[17], offset); + w[51] = amd_bytealign (w[15], w[16], offset); + w[50] = amd_bytealign (w[14], w[15], offset); + w[49] = amd_bytealign (w[13], w[14], offset); + w[48] = amd_bytealign (w[12], w[13], offset); + w[47] = amd_bytealign (w[11], w[12], offset); + w[46] = amd_bytealign (w[10], w[11], offset); + w[45] = amd_bytealign (w[ 9], w[10], offset); + w[44] = amd_bytealign (w[ 8], w[ 9], offset); + w[43] = amd_bytealign (w[ 7], w[ 8], offset); + w[42] = amd_bytealign (w[ 6], w[ 7], offset); + w[41] = amd_bytealign (w[ 5], w[ 6], offset); + w[40] = amd_bytealign (w[ 4], w[ 5], offset); + w[39] = amd_bytealign (w[ 3], w[ 4], offset); + w[38] = amd_bytealign (w[ 2], w[ 3], offset); + w[37] = amd_bytealign (w[ 1], w[ 2], offset); + w[36] = amd_bytealign (w[ 0], w[ 1], offset); + w[35] = amd_bytealign ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 
0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = amd_bytealign (w[26], w[27], offset); + w[62] = amd_bytealign (w[25], w[26], offset); + w[61] = amd_bytealign (w[24], w[25], offset); + w[60] = amd_bytealign (w[23], w[24], offset); + w[59] = amd_bytealign (w[22], w[23], offset); + w[58] = amd_bytealign (w[21], w[22], offset); + w[57] = amd_bytealign (w[20], w[21], offset); + w[56] = amd_bytealign (w[19], w[20], offset); + w[55] = amd_bytealign (w[18], w[19], offset); + w[54] = amd_bytealign (w[17], w[18], offset); + w[53] = amd_bytealign (w[16], w[17], offset); + w[52] = amd_bytealign (w[15], w[16], offset); + w[51] = amd_bytealign (w[14], w[15], offset); + w[50] = amd_bytealign (w[13], w[14], offset); + w[49] = amd_bytealign (w[12], w[13], offset); + w[48] = amd_bytealign (w[11], w[12], offset); + w[47] = amd_bytealign (w[10], w[11], offset); + w[46] = amd_bytealign (w[ 9], w[10], offset); + w[45] = amd_bytealign (w[ 8], w[ 9], offset); + w[44] = amd_bytealign (w[ 7], w[ 8], offset); + w[43] = amd_bytealign (w[ 6], w[ 7], offset); + w[42] = amd_bytealign (w[ 5], w[ 6], offset); + w[41] = amd_bytealign (w[ 4], w[ 5], offset); + w[40] = amd_bytealign (w[ 3], w[ 4], offset); + w[39] = amd_bytealign (w[ 2], w[ 3], offset); + w[38] = amd_bytealign (w[ 1], w[ 2], offset); + w[37] = amd_bytealign (w[ 0], w[ 1], offset); + w[36] = amd_bytealign ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = amd_bytealign (w[25], 
w[26], offset); + w[62] = amd_bytealign (w[24], w[25], offset); + w[61] = amd_bytealign (w[23], w[24], offset); + w[60] = amd_bytealign (w[22], w[23], offset); + w[59] = amd_bytealign (w[21], w[22], offset); + w[58] = amd_bytealign (w[20], w[21], offset); + w[57] = amd_bytealign (w[19], w[20], offset); + w[56] = amd_bytealign (w[18], w[19], offset); + w[55] = amd_bytealign (w[17], w[18], offset); + w[54] = amd_bytealign (w[16], w[17], offset); + w[53] = amd_bytealign (w[15], w[16], offset); + w[52] = amd_bytealign (w[14], w[15], offset); + w[51] = amd_bytealign (w[13], w[14], offset); + w[50] = amd_bytealign (w[12], w[13], offset); + w[49] = amd_bytealign (w[11], w[12], offset); + w[48] = amd_bytealign (w[10], w[11], offset); + w[47] = amd_bytealign (w[ 9], w[10], offset); + w[46] = amd_bytealign (w[ 8], w[ 9], offset); + w[45] = amd_bytealign (w[ 7], w[ 8], offset); + w[44] = amd_bytealign (w[ 6], w[ 7], offset); + w[43] = amd_bytealign (w[ 5], w[ 6], offset); + w[42] = amd_bytealign (w[ 4], w[ 5], offset); + w[41] = amd_bytealign (w[ 3], w[ 4], offset); + w[40] = amd_bytealign (w[ 2], w[ 3], offset); + w[39] = amd_bytealign (w[ 1], w[ 2], offset); + w[38] = amd_bytealign (w[ 0], w[ 1], offset); + w[37] = amd_bytealign ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = amd_bytealign (w[24], w[25], offset); + w[62] = amd_bytealign (w[23], w[24], offset); + w[61] = amd_bytealign (w[22], w[23], offset); + w[60] = amd_bytealign (w[21], w[22], offset); + w[59] = amd_bytealign (w[20], w[21], 
offset); + w[58] = amd_bytealign (w[19], w[20], offset); + w[57] = amd_bytealign (w[18], w[19], offset); + w[56] = amd_bytealign (w[17], w[18], offset); + w[55] = amd_bytealign (w[16], w[17], offset); + w[54] = amd_bytealign (w[15], w[16], offset); + w[53] = amd_bytealign (w[14], w[15], offset); + w[52] = amd_bytealign (w[13], w[14], offset); + w[51] = amd_bytealign (w[12], w[13], offset); + w[50] = amd_bytealign (w[11], w[12], offset); + w[49] = amd_bytealign (w[10], w[11], offset); + w[48] = amd_bytealign (w[ 9], w[10], offset); + w[47] = amd_bytealign (w[ 8], w[ 9], offset); + w[46] = amd_bytealign (w[ 7], w[ 8], offset); + w[45] = amd_bytealign (w[ 6], w[ 7], offset); + w[44] = amd_bytealign (w[ 5], w[ 6], offset); + w[43] = amd_bytealign (w[ 4], w[ 5], offset); + w[42] = amd_bytealign (w[ 3], w[ 4], offset); + w[41] = amd_bytealign (w[ 2], w[ 3], offset); + w[40] = amd_bytealign (w[ 1], w[ 2], offset); + w[39] = amd_bytealign (w[ 0], w[ 1], offset); + w[38] = amd_bytealign ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = amd_bytealign (w[23], w[24], offset); + w[62] = amd_bytealign (w[22], w[23], offset); + w[61] = amd_bytealign (w[21], w[22], offset); + w[60] = amd_bytealign (w[20], w[21], offset); + w[59] = amd_bytealign (w[19], w[20], offset); + w[58] = amd_bytealign (w[18], w[19], offset); + w[57] = amd_bytealign (w[17], w[18], offset); + w[56] = amd_bytealign (w[16], w[17], offset); + w[55] = amd_bytealign (w[15], w[16], offset); + w[54] = amd_bytealign (w[14], 
w[15], offset); + w[53] = amd_bytealign (w[13], w[14], offset); + w[52] = amd_bytealign (w[12], w[13], offset); + w[51] = amd_bytealign (w[11], w[12], offset); + w[50] = amd_bytealign (w[10], w[11], offset); + w[49] = amd_bytealign (w[ 9], w[10], offset); + w[48] = amd_bytealign (w[ 8], w[ 9], offset); + w[47] = amd_bytealign (w[ 7], w[ 8], offset); + w[46] = amd_bytealign (w[ 6], w[ 7], offset); + w[45] = amd_bytealign (w[ 5], w[ 6], offset); + w[44] = amd_bytealign (w[ 4], w[ 5], offset); + w[43] = amd_bytealign (w[ 3], w[ 4], offset); + w[42] = amd_bytealign (w[ 2], w[ 3], offset); + w[41] = amd_bytealign (w[ 1], w[ 2], offset); + w[40] = amd_bytealign (w[ 0], w[ 1], offset); + w[39] = amd_bytealign ( 0, w[ 0], offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = amd_bytealign (w[22], w[23], offset); + w[62] = amd_bytealign (w[21], w[22], offset); + w[61] = amd_bytealign (w[20], w[21], offset); + w[60] = amd_bytealign (w[19], w[20], offset); + w[59] = amd_bytealign (w[18], w[19], offset); + w[58] = amd_bytealign (w[17], w[18], offset); + w[57] = amd_bytealign (w[16], w[17], offset); + w[56] = amd_bytealign (w[15], w[16], offset); + w[55] = amd_bytealign (w[14], w[15], offset); + w[54] = amd_bytealign (w[13], w[14], offset); + w[53] = amd_bytealign (w[12], w[13], offset); + w[52] = amd_bytealign (w[11], w[12], offset); + w[51] = amd_bytealign (w[10], w[11], offset); + w[50] = amd_bytealign (w[ 9], w[10], offset); + w[49] = amd_bytealign (w[ 8], w[ 9], offset); + w[48] = 
amd_bytealign (w[ 7], w[ 8], offset); + w[47] = amd_bytealign (w[ 6], w[ 7], offset); + w[46] = amd_bytealign (w[ 5], w[ 6], offset); + w[45] = amd_bytealign (w[ 4], w[ 5], offset); + w[44] = amd_bytealign (w[ 3], w[ 4], offset); + w[43] = amd_bytealign (w[ 2], w[ 3], offset); + w[42] = amd_bytealign (w[ 1], w[ 2], offset); + w[41] = amd_bytealign (w[ 0], w[ 1], offset); + w[40] = amd_bytealign ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = amd_bytealign (w[21], w[22], offset); + w[62] = amd_bytealign (w[20], w[21], offset); + w[61] = amd_bytealign (w[19], w[20], offset); + w[60] = amd_bytealign (w[18], w[19], offset); + w[59] = amd_bytealign (w[17], w[18], offset); + w[58] = amd_bytealign (w[16], w[17], offset); + w[57] = amd_bytealign (w[15], w[16], offset); + w[56] = amd_bytealign (w[14], w[15], offset); + w[55] = amd_bytealign (w[13], w[14], offset); + w[54] = amd_bytealign (w[12], w[13], offset); + w[53] = amd_bytealign (w[11], w[12], offset); + w[52] = amd_bytealign (w[10], w[11], offset); + w[51] = amd_bytealign (w[ 9], w[10], offset); + w[50] = amd_bytealign (w[ 8], w[ 9], offset); + w[49] = amd_bytealign (w[ 7], w[ 8], offset); + w[48] = amd_bytealign (w[ 6], w[ 7], offset); + w[47] = amd_bytealign (w[ 5], w[ 6], offset); + w[46] = amd_bytealign (w[ 4], w[ 5], offset); + w[45] = amd_bytealign (w[ 3], w[ 4], offset); + w[44] = amd_bytealign (w[ 2], w[ 3], offset); + w[43] = amd_bytealign (w[ 1], w[ 2], offset); + w[42] = amd_bytealign (w[ 
0], w[ 1], offset); + w[41] = amd_bytealign ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = amd_bytealign (w[20], w[21], offset); + w[62] = amd_bytealign (w[19], w[20], offset); + w[61] = amd_bytealign (w[18], w[19], offset); + w[60] = amd_bytealign (w[17], w[18], offset); + w[59] = amd_bytealign (w[16], w[17], offset); + w[58] = amd_bytealign (w[15], w[16], offset); + w[57] = amd_bytealign (w[14], w[15], offset); + w[56] = amd_bytealign (w[13], w[14], offset); + w[55] = amd_bytealign (w[12], w[13], offset); + w[54] = amd_bytealign (w[11], w[12], offset); + w[53] = amd_bytealign (w[10], w[11], offset); + w[52] = amd_bytealign (w[ 9], w[10], offset); + w[51] = amd_bytealign (w[ 8], w[ 9], offset); + w[50] = amd_bytealign (w[ 7], w[ 8], offset); + w[49] = amd_bytealign (w[ 6], w[ 7], offset); + w[48] = amd_bytealign (w[ 5], w[ 6], offset); + w[47] = amd_bytealign (w[ 4], w[ 5], offset); + w[46] = amd_bytealign (w[ 3], w[ 4], offset); + w[45] = amd_bytealign (w[ 2], w[ 3], offset); + w[44] = amd_bytealign (w[ 1], w[ 2], offset); + w[43] = amd_bytealign (w[ 0], w[ 1], offset); + w[42] = amd_bytealign ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; 
+ w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = amd_bytealign (w[19], w[20], offset); + w[62] = amd_bytealign (w[18], w[19], offset); + w[61] = amd_bytealign (w[17], w[18], offset); + w[60] = amd_bytealign (w[16], w[17], offset); + w[59] = amd_bytealign (w[15], w[16], offset); + w[58] = amd_bytealign (w[14], w[15], offset); + w[57] = amd_bytealign (w[13], w[14], offset); + w[56] = amd_bytealign (w[12], w[13], offset); + w[55] = amd_bytealign (w[11], w[12], offset); + w[54] = amd_bytealign (w[10], w[11], offset); + w[53] = amd_bytealign (w[ 9], w[10], offset); + w[52] = amd_bytealign (w[ 8], w[ 9], offset); + w[51] = amd_bytealign (w[ 7], w[ 8], offset); + w[50] = amd_bytealign (w[ 6], w[ 7], offset); + w[49] = amd_bytealign (w[ 5], w[ 6], offset); + w[48] = amd_bytealign (w[ 4], w[ 5], offset); + w[47] = amd_bytealign (w[ 3], w[ 4], offset); + w[46] = amd_bytealign (w[ 2], w[ 3], offset); + w[45] = amd_bytealign (w[ 1], w[ 2], offset); + w[44] = amd_bytealign (w[ 0], w[ 1], offset); + w[43] = amd_bytealign ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = amd_bytealign (w[18], w[19], offset); + w[62] = amd_bytealign (w[17], w[18], offset); + w[61] = amd_bytealign (w[16], w[17], offset); + w[60] = 
amd_bytealign (w[15], w[16], offset); + w[59] = amd_bytealign (w[14], w[15], offset); + w[58] = amd_bytealign (w[13], w[14], offset); + w[57] = amd_bytealign (w[12], w[13], offset); + w[56] = amd_bytealign (w[11], w[12], offset); + w[55] = amd_bytealign (w[10], w[11], offset); + w[54] = amd_bytealign (w[ 9], w[10], offset); + w[53] = amd_bytealign (w[ 8], w[ 9], offset); + w[52] = amd_bytealign (w[ 7], w[ 8], offset); + w[51] = amd_bytealign (w[ 6], w[ 7], offset); + w[50] = amd_bytealign (w[ 5], w[ 6], offset); + w[49] = amd_bytealign (w[ 4], w[ 5], offset); + w[48] = amd_bytealign (w[ 3], w[ 4], offset); + w[47] = amd_bytealign (w[ 2], w[ 3], offset); + w[46] = amd_bytealign (w[ 1], w[ 2], offset); + w[45] = amd_bytealign (w[ 0], w[ 1], offset); + w[44] = amd_bytealign ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = amd_bytealign (w[17], w[18], offset); + w[62] = amd_bytealign (w[16], w[17], offset); + w[61] = amd_bytealign (w[15], w[16], offset); + w[60] = amd_bytealign (w[14], w[15], offset); + w[59] = amd_bytealign (w[13], w[14], offset); + w[58] = amd_bytealign (w[12], w[13], offset); + w[57] = amd_bytealign (w[11], w[12], offset); + w[56] = amd_bytealign (w[10], w[11], offset); + w[55] = amd_bytealign (w[ 9], w[10], offset); + w[54] = amd_bytealign (w[ 8], w[ 9], offset); + w[53] = amd_bytealign (w[ 7], w[ 8], offset); + w[52] = amd_bytealign (w[ 6], w[ 7], offset); + w[51] = amd_bytealign 
(w[ 5], w[ 6], offset); + w[50] = amd_bytealign (w[ 4], w[ 5], offset); + w[49] = amd_bytealign (w[ 3], w[ 4], offset); + w[48] = amd_bytealign (w[ 2], w[ 3], offset); + w[47] = amd_bytealign (w[ 1], w[ 2], offset); + w[46] = amd_bytealign (w[ 0], w[ 1], offset); + w[45] = amd_bytealign ( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = amd_bytealign (w[16], w[17], offset); + w[62] = amd_bytealign (w[15], w[16], offset); + w[61] = amd_bytealign (w[14], w[15], offset); + w[60] = amd_bytealign (w[13], w[14], offset); + w[59] = amd_bytealign (w[12], w[13], offset); + w[58] = amd_bytealign (w[11], w[12], offset); + w[57] = amd_bytealign (w[10], w[11], offset); + w[56] = amd_bytealign (w[ 9], w[10], offset); + w[55] = amd_bytealign (w[ 8], w[ 9], offset); + w[54] = amd_bytealign (w[ 7], w[ 8], offset); + w[53] = amd_bytealign (w[ 6], w[ 7], offset); + w[52] = amd_bytealign (w[ 5], w[ 6], offset); + w[51] = amd_bytealign (w[ 4], w[ 5], offset); + w[50] = amd_bytealign (w[ 3], w[ 4], offset); + w[49] = amd_bytealign (w[ 2], w[ 3], offset); + w[48] = amd_bytealign (w[ 1], w[ 2], offset); + w[47] = amd_bytealign (w[ 0], w[ 1], offset); + w[46] = amd_bytealign ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; 
+ w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = amd_bytealign (w[15], w[16], offset); + w[62] = amd_bytealign (w[14], w[15], offset); + w[61] = amd_bytealign (w[13], w[14], offset); + w[60] = amd_bytealign (w[12], w[13], offset); + w[59] = amd_bytealign (w[11], w[12], offset); + w[58] = amd_bytealign (w[10], w[11], offset); + w[57] = amd_bytealign (w[ 9], w[10], offset); + w[56] = amd_bytealign (w[ 8], w[ 9], offset); + w[55] = amd_bytealign (w[ 7], w[ 8], offset); + w[54] = amd_bytealign (w[ 6], w[ 7], offset); + w[53] = amd_bytealign (w[ 5], w[ 6], offset); + w[52] = amd_bytealign (w[ 4], w[ 5], offset); + w[51] = amd_bytealign (w[ 3], w[ 4], offset); + w[50] = amd_bytealign (w[ 2], w[ 3], offset); + w[49] = amd_bytealign (w[ 1], w[ 2], offset); + w[48] = amd_bytealign (w[ 0], w[ 1], offset); + w[47] = amd_bytealign ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = amd_bytealign (w[14], w[15], offset); + w[62] = amd_bytealign (w[13], w[14], offset); + w[61] = amd_bytealign (w[12], w[13], offset); + 
w[60] = amd_bytealign (w[11], w[12], offset); + w[59] = amd_bytealign (w[10], w[11], offset); + w[58] = amd_bytealign (w[ 9], w[10], offset); + w[57] = amd_bytealign (w[ 8], w[ 9], offset); + w[56] = amd_bytealign (w[ 7], w[ 8], offset); + w[55] = amd_bytealign (w[ 6], w[ 7], offset); + w[54] = amd_bytealign (w[ 5], w[ 6], offset); + w[53] = amd_bytealign (w[ 4], w[ 5], offset); + w[52] = amd_bytealign (w[ 3], w[ 4], offset); + w[51] = amd_bytealign (w[ 2], w[ 3], offset); + w[50] = amd_bytealign (w[ 1], w[ 2], offset); + w[49] = amd_bytealign (w[ 0], w[ 1], offset); + w[48] = amd_bytealign ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = amd_bytealign (w[13], w[14], offset); + w[62] = amd_bytealign (w[12], w[13], offset); + w[61] = amd_bytealign (w[11], w[12], offset); + w[60] = amd_bytealign (w[10], w[11], offset); + w[59] = amd_bytealign (w[ 9], w[10], offset); + w[58] = amd_bytealign (w[ 8], w[ 9], offset); + w[57] = amd_bytealign (w[ 7], w[ 8], offset); + w[56] = amd_bytealign (w[ 6], w[ 7], offset); + w[55] = amd_bytealign (w[ 5], w[ 6], offset); + w[54] = amd_bytealign (w[ 4], w[ 5], offset); + w[53] = amd_bytealign (w[ 3], w[ 4], offset); + w[52] = amd_bytealign (w[ 2], w[ 3], offset); + w[51] = amd_bytealign (w[ 1], w[ 2], offset); + w[50] = amd_bytealign (w[ 0], w[ 1], offset); + w[49] = amd_bytealign ( 0, w[ 0], offset); + w[48] = 0; + 
w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = amd_bytealign (w[12], w[13], offset); + w[62] = amd_bytealign (w[11], w[12], offset); + w[61] = amd_bytealign (w[10], w[11], offset); + w[60] = amd_bytealign (w[ 9], w[10], offset); + w[59] = amd_bytealign (w[ 8], w[ 9], offset); + w[58] = amd_bytealign (w[ 7], w[ 8], offset); + w[57] = amd_bytealign (w[ 6], w[ 7], offset); + w[56] = amd_bytealign (w[ 5], w[ 6], offset); + w[55] = amd_bytealign (w[ 4], w[ 5], offset); + w[54] = amd_bytealign (w[ 3], w[ 4], offset); + w[53] = amd_bytealign (w[ 2], w[ 3], offset); + w[52] = amd_bytealign (w[ 1], w[ 2], offset); + w[51] = amd_bytealign (w[ 0], w[ 1], offset); + w[50] = amd_bytealign ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = 
amd_bytealign (w[11], w[12], offset); + w[62] = amd_bytealign (w[10], w[11], offset); + w[61] = amd_bytealign (w[ 9], w[10], offset); + w[60] = amd_bytealign (w[ 8], w[ 9], offset); + w[59] = amd_bytealign (w[ 7], w[ 8], offset); + w[58] = amd_bytealign (w[ 6], w[ 7], offset); + w[57] = amd_bytealign (w[ 5], w[ 6], offset); + w[56] = amd_bytealign (w[ 4], w[ 5], offset); + w[55] = amd_bytealign (w[ 3], w[ 4], offset); + w[54] = amd_bytealign (w[ 2], w[ 3], offset); + w[53] = amd_bytealign (w[ 1], w[ 2], offset); + w[52] = amd_bytealign (w[ 0], w[ 1], offset); + w[51] = amd_bytealign ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = amd_bytealign (w[10], w[11], offset); + w[62] = amd_bytealign (w[ 9], w[10], offset); + w[61] = amd_bytealign (w[ 8], w[ 9], offset); + w[60] = amd_bytealign (w[ 7], w[ 8], offset); + w[59] = amd_bytealign (w[ 6], w[ 7], offset); + w[58] = amd_bytealign (w[ 5], w[ 6], offset); + w[57] = amd_bytealign (w[ 4], w[ 5], offset); + w[56] = amd_bytealign (w[ 3], w[ 4], offset); + w[55] = amd_bytealign (w[ 2], w[ 3], offset); + w[54] = amd_bytealign (w[ 1], w[ 2], offset); + w[53] = amd_bytealign (w[ 0], w[ 1], offset); + w[52] = amd_bytealign ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 
0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = amd_bytealign (w[ 9], w[10], offset); + w[62] = amd_bytealign (w[ 8], w[ 9], offset); + w[61] = amd_bytealign (w[ 7], w[ 8], offset); + w[60] = amd_bytealign (w[ 6], w[ 7], offset); + w[59] = amd_bytealign (w[ 5], w[ 6], offset); + w[58] = amd_bytealign (w[ 4], w[ 5], offset); + w[57] = amd_bytealign (w[ 3], w[ 4], offset); + w[56] = amd_bytealign (w[ 2], w[ 3], offset); + w[55] = amd_bytealign (w[ 1], w[ 2], offset); + w[54] = amd_bytealign (w[ 0], w[ 1], offset); + w[53] = amd_bytealign ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = amd_bytealign (w[ 8], w[ 9], offset); + w[62] = amd_bytealign (w[ 7], w[ 8], offset); + w[61] = amd_bytealign (w[ 6], w[ 7], offset); + w[60] = amd_bytealign (w[ 5], w[ 6], 
offset); + w[59] = amd_bytealign (w[ 4], w[ 5], offset); + w[58] = amd_bytealign (w[ 3], w[ 4], offset); + w[57] = amd_bytealign (w[ 2], w[ 3], offset); + w[56] = amd_bytealign (w[ 1], w[ 2], offset); + w[55] = amd_bytealign (w[ 0], w[ 1], offset); + w[54] = amd_bytealign ( 0, w[ 0], offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = amd_bytealign (w[ 7], w[ 8], offset); + w[62] = amd_bytealign (w[ 6], w[ 7], offset); + w[61] = amd_bytealign (w[ 5], w[ 6], offset); + w[60] = amd_bytealign (w[ 4], w[ 5], offset); + w[59] = amd_bytealign (w[ 3], w[ 4], offset); + w[58] = amd_bytealign (w[ 2], w[ 3], offset); + w[57] = amd_bytealign (w[ 1], w[ 2], offset); + w[56] = amd_bytealign (w[ 0], w[ 1], offset); + w[55] = amd_bytealign ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + 
w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = amd_bytealign (w[ 6], w[ 7], offset); + w[62] = amd_bytealign (w[ 5], w[ 6], offset); + w[61] = amd_bytealign (w[ 4], w[ 5], offset); + w[60] = amd_bytealign (w[ 3], w[ 4], offset); + w[59] = amd_bytealign (w[ 2], w[ 3], offset); + w[58] = amd_bytealign (w[ 1], w[ 2], offset); + w[57] = amd_bytealign (w[ 0], w[ 1], offset); + w[56] = amd_bytealign ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = amd_bytealign (w[ 5], w[ 6], offset); + w[62] = amd_bytealign (w[ 4], w[ 5], offset); + w[61] = amd_bytealign (w[ 3], w[ 4], offset); + w[60] = amd_bytealign (w[ 2], w[ 3], offset); + w[59] = amd_bytealign (w[ 1], w[ 2], offset); + w[58] = amd_bytealign (w[ 0], w[ 1], offset); + w[57] = amd_bytealign ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + 
w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = amd_bytealign (w[ 4], w[ 5], offset); + w[62] = amd_bytealign (w[ 3], w[ 4], offset); + w[61] = amd_bytealign (w[ 2], w[ 3], offset); + w[60] = amd_bytealign (w[ 1], w[ 2], offset); + w[59] = amd_bytealign (w[ 0], w[ 1], offset); + w[58] = amd_bytealign ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = amd_bytealign (w[ 3], w[ 4], offset); + w[62] = amd_bytealign (w[ 2], w[ 3], offset); + w[61] = amd_bytealign (w[ 1], w[ 2], offset); + w[60] = amd_bytealign (w[ 0], w[ 1], offset); + w[59] = amd_bytealign ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + 
w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = amd_bytealign (w[ 2], w[ 3], offset); + w[62] = amd_bytealign (w[ 1], w[ 2], offset); + w[61] = amd_bytealign (w[ 0], w[ 1], offset); + w[60] = amd_bytealign ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = amd_bytealign (w[ 1], w[ 2], offset); + w[62] = amd_bytealign (w[ 0], w[ 1], offset); + w[61] = amd_bytealign ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 
0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = amd_bytealign (w[ 0], w[ 1], offset); + w[62] = amd_bytealign ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = amd_bytealign ( 0, w[ 0], offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + 
w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); + + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w[63] = __byte_perm (w[62], w[63], selector); + w[62] = __byte_perm (w[61], w[62], selector); + w[61] = __byte_perm (w[60], w[61], selector); + w[60] = __byte_perm (w[59], w[60], selector); + w[59] = __byte_perm (w[58], w[59], selector); + w[58] = __byte_perm (w[57], w[58], selector); + w[57] = __byte_perm (w[56], w[57], selector); + w[56] = __byte_perm (w[55], w[56], selector); + w[55] = __byte_perm (w[54], w[55], selector); + w[54] = __byte_perm (w[53], w[54], selector); + w[53] = __byte_perm (w[52], w[53], selector); + w[52] = __byte_perm (w[51], w[52], selector); + w[51] = __byte_perm (w[50], w[51], selector); + w[50] = __byte_perm (w[49], w[50], selector); + w[49] = __byte_perm (w[48], w[49], selector); + w[48] = __byte_perm (w[47], w[48], selector); + w[47] = __byte_perm (w[46], w[47], selector); + w[46] = __byte_perm (w[45], w[46], selector); + w[45] = __byte_perm (w[44], w[45], selector); + w[44] = __byte_perm (w[43], w[44], selector); + w[43] = __byte_perm (w[42], w[43], selector); + w[42] = __byte_perm (w[41], w[42], selector); + w[41] = __byte_perm (w[40], w[41], selector); + w[40] = __byte_perm (w[39], w[40], selector); + w[39] = __byte_perm (w[38], w[39], selector); + w[38] = __byte_perm (w[37], w[38], selector); + w[37] = __byte_perm (w[36], w[37], selector); + w[36] = __byte_perm (w[35], w[36], selector); + w[35] = __byte_perm (w[34], w[35], selector); + w[34] = __byte_perm (w[33], w[34], selector); + w[33] = __byte_perm (w[32], w[33], selector); + w[32] = __byte_perm (w[31], 
w[32], selector); + w[31] = __byte_perm (w[30], w[31], selector); + w[30] = __byte_perm (w[29], w[30], selector); + w[29] = __byte_perm (w[28], w[29], selector); + w[28] = __byte_perm (w[27], w[28], selector); + w[27] = __byte_perm (w[26], w[27], selector); + w[26] = __byte_perm (w[25], w[26], selector); + w[25] = __byte_perm (w[24], w[25], selector); + w[24] = __byte_perm (w[23], w[24], selector); + w[23] = __byte_perm (w[22], w[23], selector); + w[22] = __byte_perm (w[21], w[22], selector); + w[21] = __byte_perm (w[20], w[21], selector); + w[20] = __byte_perm (w[19], w[20], selector); + w[19] = __byte_perm (w[18], w[19], selector); + w[18] = __byte_perm (w[17], w[18], selector); + w[17] = __byte_perm (w[16], w[17], selector); + w[16] = __byte_perm (w[15], w[16], selector); + w[15] = __byte_perm (w[14], w[15], selector); + w[14] = __byte_perm (w[13], w[14], selector); + w[13] = __byte_perm (w[12], w[13], selector); + w[12] = __byte_perm (w[11], w[12], selector); + w[11] = __byte_perm (w[10], w[11], selector); + w[10] = __byte_perm (w[ 9], w[10], selector); + w[ 9] = __byte_perm (w[ 8], w[ 9], selector); + w[ 8] = __byte_perm (w[ 7], w[ 8], selector); + w[ 7] = __byte_perm (w[ 6], w[ 7], selector); + w[ 6] = __byte_perm (w[ 5], w[ 6], selector); + w[ 5] = __byte_perm (w[ 4], w[ 5], selector); + w[ 4] = __byte_perm (w[ 3], w[ 4], selector); + w[ 3] = __byte_perm (w[ 2], w[ 3], selector); + w[ 2] = __byte_perm (w[ 1], w[ 2], selector); + w[ 1] = __byte_perm (w[ 0], w[ 1], selector); + w[ 0] = __byte_perm ( 0, w[ 0], selector); + + break; + + case 1: + w[63] = __byte_perm (w[61], w[62], selector); + w[62] = __byte_perm (w[60], w[61], selector); + w[61] = __byte_perm (w[59], w[60], selector); + w[60] = __byte_perm (w[58], w[59], selector); + w[59] = __byte_perm (w[57], w[58], selector); + w[58] = __byte_perm (w[56], w[57], selector); + w[57] = __byte_perm (w[55], w[56], selector); + w[56] = __byte_perm (w[54], w[55], selector); + w[55] = __byte_perm (w[53], w[54], 
selector); + w[54] = __byte_perm (w[52], w[53], selector); + w[53] = __byte_perm (w[51], w[52], selector); + w[52] = __byte_perm (w[50], w[51], selector); + w[51] = __byte_perm (w[49], w[50], selector); + w[50] = __byte_perm (w[48], w[49], selector); + w[49] = __byte_perm (w[47], w[48], selector); + w[48] = __byte_perm (w[46], w[47], selector); + w[47] = __byte_perm (w[45], w[46], selector); + w[46] = __byte_perm (w[44], w[45], selector); + w[45] = __byte_perm (w[43], w[44], selector); + w[44] = __byte_perm (w[42], w[43], selector); + w[43] = __byte_perm (w[41], w[42], selector); + w[42] = __byte_perm (w[40], w[41], selector); + w[41] = __byte_perm (w[39], w[40], selector); + w[40] = __byte_perm (w[38], w[39], selector); + w[39] = __byte_perm (w[37], w[38], selector); + w[38] = __byte_perm (w[36], w[37], selector); + w[37] = __byte_perm (w[35], w[36], selector); + w[36] = __byte_perm (w[34], w[35], selector); + w[35] = __byte_perm (w[33], w[34], selector); + w[34] = __byte_perm (w[32], w[33], selector); + w[33] = __byte_perm (w[31], w[32], selector); + w[32] = __byte_perm (w[30], w[31], selector); + w[31] = __byte_perm (w[29], w[30], selector); + w[30] = __byte_perm (w[28], w[29], selector); + w[29] = __byte_perm (w[27], w[28], selector); + w[28] = __byte_perm (w[26], w[27], selector); + w[27] = __byte_perm (w[25], w[26], selector); + w[26] = __byte_perm (w[24], w[25], selector); + w[25] = __byte_perm (w[23], w[24], selector); + w[24] = __byte_perm (w[22], w[23], selector); + w[23] = __byte_perm (w[21], w[22], selector); + w[22] = __byte_perm (w[20], w[21], selector); + w[21] = __byte_perm (w[19], w[20], selector); + w[20] = __byte_perm (w[18], w[19], selector); + w[19] = __byte_perm (w[17], w[18], selector); + w[18] = __byte_perm (w[16], w[17], selector); + w[17] = __byte_perm (w[15], w[16], selector); + w[16] = __byte_perm (w[14], w[15], selector); + w[15] = __byte_perm (w[13], w[14], selector); + w[14] = __byte_perm (w[12], w[13], selector); + w[13] = 
__byte_perm (w[11], w[12], selector); + w[12] = __byte_perm (w[10], w[11], selector); + w[11] = __byte_perm (w[ 9], w[10], selector); + w[10] = __byte_perm (w[ 8], w[ 9], selector); + w[ 9] = __byte_perm (w[ 7], w[ 8], selector); + w[ 8] = __byte_perm (w[ 6], w[ 7], selector); + w[ 7] = __byte_perm (w[ 5], w[ 6], selector); + w[ 6] = __byte_perm (w[ 4], w[ 5], selector); + w[ 5] = __byte_perm (w[ 3], w[ 4], selector); + w[ 4] = __byte_perm (w[ 2], w[ 3], selector); + w[ 3] = __byte_perm (w[ 1], w[ 2], selector); + w[ 2] = __byte_perm (w[ 0], w[ 1], selector); + w[ 1] = __byte_perm ( 0, w[ 0], selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = __byte_perm (w[60], w[61], selector); + w[62] = __byte_perm (w[59], w[60], selector); + w[61] = __byte_perm (w[58], w[59], selector); + w[60] = __byte_perm (w[57], w[58], selector); + w[59] = __byte_perm (w[56], w[57], selector); + w[58] = __byte_perm (w[55], w[56], selector); + w[57] = __byte_perm (w[54], w[55], selector); + w[56] = __byte_perm (w[53], w[54], selector); + w[55] = __byte_perm (w[52], w[53], selector); + w[54] = __byte_perm (w[51], w[52], selector); + w[53] = __byte_perm (w[50], w[51], selector); + w[52] = __byte_perm (w[49], w[50], selector); + w[51] = __byte_perm (w[48], w[49], selector); + w[50] = __byte_perm (w[47], w[48], selector); + w[49] = __byte_perm (w[46], w[47], selector); + w[48] = __byte_perm (w[45], w[46], selector); + w[47] = __byte_perm (w[44], w[45], selector); + w[46] = __byte_perm (w[43], w[44], selector); + w[45] = __byte_perm (w[42], w[43], selector); + w[44] = __byte_perm (w[41], w[42], selector); + w[43] = __byte_perm (w[40], w[41], selector); + w[42] = __byte_perm (w[39], w[40], selector); + w[41] = __byte_perm (w[38], w[39], selector); + w[40] = __byte_perm (w[37], w[38], selector); + w[39] = __byte_perm (w[36], w[37], selector); + w[38] = __byte_perm (w[35], w[36], selector); + w[37] = __byte_perm (w[34], w[35], selector); + w[36] = __byte_perm (w[33], w[34], selector); + w[35] 
= __byte_perm (w[32], w[33], selector); + w[34] = __byte_perm (w[31], w[32], selector); + w[33] = __byte_perm (w[30], w[31], selector); + w[32] = __byte_perm (w[29], w[30], selector); + w[31] = __byte_perm (w[28], w[29], selector); + w[30] = __byte_perm (w[27], w[28], selector); + w[29] = __byte_perm (w[26], w[27], selector); + w[28] = __byte_perm (w[25], w[26], selector); + w[27] = __byte_perm (w[24], w[25], selector); + w[26] = __byte_perm (w[23], w[24], selector); + w[25] = __byte_perm (w[22], w[23], selector); + w[24] = __byte_perm (w[21], w[22], selector); + w[23] = __byte_perm (w[20], w[21], selector); + w[22] = __byte_perm (w[19], w[20], selector); + w[21] = __byte_perm (w[18], w[19], selector); + w[20] = __byte_perm (w[17], w[18], selector); + w[19] = __byte_perm (w[16], w[17], selector); + w[18] = __byte_perm (w[15], w[16], selector); + w[17] = __byte_perm (w[14], w[15], selector); + w[16] = __byte_perm (w[13], w[14], selector); + w[15] = __byte_perm (w[12], w[13], selector); + w[14] = __byte_perm (w[11], w[12], selector); + w[13] = __byte_perm (w[10], w[11], selector); + w[12] = __byte_perm (w[ 9], w[10], selector); + w[11] = __byte_perm (w[ 8], w[ 9], selector); + w[10] = __byte_perm (w[ 7], w[ 8], selector); + w[ 9] = __byte_perm (w[ 6], w[ 7], selector); + w[ 8] = __byte_perm (w[ 5], w[ 6], selector); + w[ 7] = __byte_perm (w[ 4], w[ 5], selector); + w[ 6] = __byte_perm (w[ 3], w[ 4], selector); + w[ 5] = __byte_perm (w[ 2], w[ 3], selector); + w[ 4] = __byte_perm (w[ 1], w[ 2], selector); + w[ 3] = __byte_perm (w[ 0], w[ 1], selector); + w[ 2] = __byte_perm ( 0, w[ 0], selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = __byte_perm (w[59], w[60], selector); + w[62] = __byte_perm (w[58], w[59], selector); + w[61] = __byte_perm (w[57], w[58], selector); + w[60] = __byte_perm (w[56], w[57], selector); + w[59] = __byte_perm (w[55], w[56], selector); + w[58] = __byte_perm (w[54], w[55], selector); + w[57] = __byte_perm (w[53], w[54], 
selector); + w[56] = __byte_perm (w[52], w[53], selector); + w[55] = __byte_perm (w[51], w[52], selector); + w[54] = __byte_perm (w[50], w[51], selector); + w[53] = __byte_perm (w[49], w[50], selector); + w[52] = __byte_perm (w[48], w[49], selector); + w[51] = __byte_perm (w[47], w[48], selector); + w[50] = __byte_perm (w[46], w[47], selector); + w[49] = __byte_perm (w[45], w[46], selector); + w[48] = __byte_perm (w[44], w[45], selector); + w[47] = __byte_perm (w[43], w[44], selector); + w[46] = __byte_perm (w[42], w[43], selector); + w[45] = __byte_perm (w[41], w[42], selector); + w[44] = __byte_perm (w[40], w[41], selector); + w[43] = __byte_perm (w[39], w[40], selector); + w[42] = __byte_perm (w[38], w[39], selector); + w[41] = __byte_perm (w[37], w[38], selector); + w[40] = __byte_perm (w[36], w[37], selector); + w[39] = __byte_perm (w[35], w[36], selector); + w[38] = __byte_perm (w[34], w[35], selector); + w[37] = __byte_perm (w[33], w[34], selector); + w[36] = __byte_perm (w[32], w[33], selector); + w[35] = __byte_perm (w[31], w[32], selector); + w[34] = __byte_perm (w[30], w[31], selector); + w[33] = __byte_perm (w[29], w[30], selector); + w[32] = __byte_perm (w[28], w[29], selector); + w[31] = __byte_perm (w[27], w[28], selector); + w[30] = __byte_perm (w[26], w[27], selector); + w[29] = __byte_perm (w[25], w[26], selector); + w[28] = __byte_perm (w[24], w[25], selector); + w[27] = __byte_perm (w[23], w[24], selector); + w[26] = __byte_perm (w[22], w[23], selector); + w[25] = __byte_perm (w[21], w[22], selector); + w[24] = __byte_perm (w[20], w[21], selector); + w[23] = __byte_perm (w[19], w[20], selector); + w[22] = __byte_perm (w[18], w[19], selector); + w[21] = __byte_perm (w[17], w[18], selector); + w[20] = __byte_perm (w[16], w[17], selector); + w[19] = __byte_perm (w[15], w[16], selector); + w[18] = __byte_perm (w[14], w[15], selector); + w[17] = __byte_perm (w[13], w[14], selector); + w[16] = __byte_perm (w[12], w[13], selector); + w[15] = 
__byte_perm (w[11], w[12], selector); + w[14] = __byte_perm (w[10], w[11], selector); + w[13] = __byte_perm (w[ 9], w[10], selector); + w[12] = __byte_perm (w[ 8], w[ 9], selector); + w[11] = __byte_perm (w[ 7], w[ 8], selector); + w[10] = __byte_perm (w[ 6], w[ 7], selector); + w[ 9] = __byte_perm (w[ 5], w[ 6], selector); + w[ 8] = __byte_perm (w[ 4], w[ 5], selector); + w[ 7] = __byte_perm (w[ 3], w[ 4], selector); + w[ 6] = __byte_perm (w[ 2], w[ 3], selector); + w[ 5] = __byte_perm (w[ 1], w[ 2], selector); + w[ 4] = __byte_perm (w[ 0], w[ 1], selector); + w[ 3] = __byte_perm ( 0, w[ 0], selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = __byte_perm (w[58], w[59], selector); + w[62] = __byte_perm (w[57], w[58], selector); + w[61] = __byte_perm (w[56], w[57], selector); + w[60] = __byte_perm (w[55], w[56], selector); + w[59] = __byte_perm (w[54], w[55], selector); + w[58] = __byte_perm (w[53], w[54], selector); + w[57] = __byte_perm (w[52], w[53], selector); + w[56] = __byte_perm (w[51], w[52], selector); + w[55] = __byte_perm (w[50], w[51], selector); + w[54] = __byte_perm (w[49], w[50], selector); + w[53] = __byte_perm (w[48], w[49], selector); + w[52] = __byte_perm (w[47], w[48], selector); + w[51] = __byte_perm (w[46], w[47], selector); + w[50] = __byte_perm (w[45], w[46], selector); + w[49] = __byte_perm (w[44], w[45], selector); + w[48] = __byte_perm (w[43], w[44], selector); + w[47] = __byte_perm (w[42], w[43], selector); + w[46] = __byte_perm (w[41], w[42], selector); + w[45] = __byte_perm (w[40], w[41], selector); + w[44] = __byte_perm (w[39], w[40], selector); + w[43] = __byte_perm (w[38], w[39], selector); + w[42] = __byte_perm (w[37], w[38], selector); + w[41] = __byte_perm (w[36], w[37], selector); + w[40] = __byte_perm (w[35], w[36], selector); + w[39] = __byte_perm (w[34], w[35], selector); + w[38] = __byte_perm (w[33], w[34], selector); + w[37] = __byte_perm (w[32], w[33], selector); + w[36] = __byte_perm (w[31], 
w[32], selector); + w[35] = __byte_perm (w[30], w[31], selector); + w[34] = __byte_perm (w[29], w[30], selector); + w[33] = __byte_perm (w[28], w[29], selector); + w[32] = __byte_perm (w[27], w[28], selector); + w[31] = __byte_perm (w[26], w[27], selector); + w[30] = __byte_perm (w[25], w[26], selector); + w[29] = __byte_perm (w[24], w[25], selector); + w[28] = __byte_perm (w[23], w[24], selector); + w[27] = __byte_perm (w[22], w[23], selector); + w[26] = __byte_perm (w[21], w[22], selector); + w[25] = __byte_perm (w[20], w[21], selector); + w[24] = __byte_perm (w[19], w[20], selector); + w[23] = __byte_perm (w[18], w[19], selector); + w[22] = __byte_perm (w[17], w[18], selector); + w[21] = __byte_perm (w[16], w[17], selector); + w[20] = __byte_perm (w[15], w[16], selector); + w[19] = __byte_perm (w[14], w[15], selector); + w[18] = __byte_perm (w[13], w[14], selector); + w[17] = __byte_perm (w[12], w[13], selector); + w[16] = __byte_perm (w[11], w[12], selector); + w[15] = __byte_perm (w[10], w[11], selector); + w[14] = __byte_perm (w[ 9], w[10], selector); + w[13] = __byte_perm (w[ 8], w[ 9], selector); + w[12] = __byte_perm (w[ 7], w[ 8], selector); + w[11] = __byte_perm (w[ 6], w[ 7], selector); + w[10] = __byte_perm (w[ 5], w[ 6], selector); + w[ 9] = __byte_perm (w[ 4], w[ 5], selector); + w[ 8] = __byte_perm (w[ 3], w[ 4], selector); + w[ 7] = __byte_perm (w[ 2], w[ 3], selector); + w[ 6] = __byte_perm (w[ 1], w[ 2], selector); + w[ 5] = __byte_perm (w[ 0], w[ 1], selector); + w[ 4] = __byte_perm ( 0, w[ 0], selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = __byte_perm (w[57], w[58], selector); + w[62] = __byte_perm (w[56], w[57], selector); + w[61] = __byte_perm (w[55], w[56], selector); + w[60] = __byte_perm (w[54], w[55], selector); + w[59] = __byte_perm (w[53], w[54], selector); + w[58] = __byte_perm (w[52], w[53], selector); + w[57] = __byte_perm (w[51], w[52], selector); + w[56] = __byte_perm (w[50], w[51], 
selector); + w[55] = __byte_perm (w[49], w[50], selector); + w[54] = __byte_perm (w[48], w[49], selector); + w[53] = __byte_perm (w[47], w[48], selector); + w[52] = __byte_perm (w[46], w[47], selector); + w[51] = __byte_perm (w[45], w[46], selector); + w[50] = __byte_perm (w[44], w[45], selector); + w[49] = __byte_perm (w[43], w[44], selector); + w[48] = __byte_perm (w[42], w[43], selector); + w[47] = __byte_perm (w[41], w[42], selector); + w[46] = __byte_perm (w[40], w[41], selector); + w[45] = __byte_perm (w[39], w[40], selector); + w[44] = __byte_perm (w[38], w[39], selector); + w[43] = __byte_perm (w[37], w[38], selector); + w[42] = __byte_perm (w[36], w[37], selector); + w[41] = __byte_perm (w[35], w[36], selector); + w[40] = __byte_perm (w[34], w[35], selector); + w[39] = __byte_perm (w[33], w[34], selector); + w[38] = __byte_perm (w[32], w[33], selector); + w[37] = __byte_perm (w[31], w[32], selector); + w[36] = __byte_perm (w[30], w[31], selector); + w[35] = __byte_perm (w[29], w[30], selector); + w[34] = __byte_perm (w[28], w[29], selector); + w[33] = __byte_perm (w[27], w[28], selector); + w[32] = __byte_perm (w[26], w[27], selector); + w[31] = __byte_perm (w[25], w[26], selector); + w[30] = __byte_perm (w[24], w[25], selector); + w[29] = __byte_perm (w[23], w[24], selector); + w[28] = __byte_perm (w[22], w[23], selector); + w[27] = __byte_perm (w[21], w[22], selector); + w[26] = __byte_perm (w[20], w[21], selector); + w[25] = __byte_perm (w[19], w[20], selector); + w[24] = __byte_perm (w[18], w[19], selector); + w[23] = __byte_perm (w[17], w[18], selector); + w[22] = __byte_perm (w[16], w[17], selector); + w[21] = __byte_perm (w[15], w[16], selector); + w[20] = __byte_perm (w[14], w[15], selector); + w[19] = __byte_perm (w[13], w[14], selector); + w[18] = __byte_perm (w[12], w[13], selector); + w[17] = __byte_perm (w[11], w[12], selector); + w[16] = __byte_perm (w[10], w[11], selector); + w[15] = __byte_perm (w[ 9], w[10], selector); + w[14] = 
__byte_perm (w[ 8], w[ 9], selector); + w[13] = __byte_perm (w[ 7], w[ 8], selector); + w[12] = __byte_perm (w[ 6], w[ 7], selector); + w[11] = __byte_perm (w[ 5], w[ 6], selector); + w[10] = __byte_perm (w[ 4], w[ 5], selector); + w[ 9] = __byte_perm (w[ 3], w[ 4], selector); + w[ 8] = __byte_perm (w[ 2], w[ 3], selector); + w[ 7] = __byte_perm (w[ 1], w[ 2], selector); + w[ 6] = __byte_perm (w[ 0], w[ 1], selector); + w[ 5] = __byte_perm ( 0, w[ 0], selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = __byte_perm (w[56], w[57], selector); + w[62] = __byte_perm (w[55], w[56], selector); + w[61] = __byte_perm (w[54], w[55], selector); + w[60] = __byte_perm (w[53], w[54], selector); + w[59] = __byte_perm (w[52], w[53], selector); + w[58] = __byte_perm (w[51], w[52], selector); + w[57] = __byte_perm (w[50], w[51], selector); + w[56] = __byte_perm (w[49], w[50], selector); + w[55] = __byte_perm (w[48], w[49], selector); + w[54] = __byte_perm (w[47], w[48], selector); + w[53] = __byte_perm (w[46], w[47], selector); + w[52] = __byte_perm (w[45], w[46], selector); + w[51] = __byte_perm (w[44], w[45], selector); + w[50] = __byte_perm (w[43], w[44], selector); + w[49] = __byte_perm (w[42], w[43], selector); + w[48] = __byte_perm (w[41], w[42], selector); + w[47] = __byte_perm (w[40], w[41], selector); + w[46] = __byte_perm (w[39], w[40], selector); + w[45] = __byte_perm (w[38], w[39], selector); + w[44] = __byte_perm (w[37], w[38], selector); + w[43] = __byte_perm (w[36], w[37], selector); + w[42] = __byte_perm (w[35], w[36], selector); + w[41] = __byte_perm (w[34], w[35], selector); + w[40] = __byte_perm (w[33], w[34], selector); + w[39] = __byte_perm (w[32], w[33], selector); + w[38] = __byte_perm (w[31], w[32], selector); + w[37] = __byte_perm (w[30], w[31], selector); + w[36] = __byte_perm (w[29], w[30], selector); + w[35] = __byte_perm (w[28], w[29], selector); + w[34] = __byte_perm (w[27], w[28], selector); + 
w[33] = __byte_perm (w[26], w[27], selector); + w[32] = __byte_perm (w[25], w[26], selector); + w[31] = __byte_perm (w[24], w[25], selector); + w[30] = __byte_perm (w[23], w[24], selector); + w[29] = __byte_perm (w[22], w[23], selector); + w[28] = __byte_perm (w[21], w[22], selector); + w[27] = __byte_perm (w[20], w[21], selector); + w[26] = __byte_perm (w[19], w[20], selector); + w[25] = __byte_perm (w[18], w[19], selector); + w[24] = __byte_perm (w[17], w[18], selector); + w[23] = __byte_perm (w[16], w[17], selector); + w[22] = __byte_perm (w[15], w[16], selector); + w[21] = __byte_perm (w[14], w[15], selector); + w[20] = __byte_perm (w[13], w[14], selector); + w[19] = __byte_perm (w[12], w[13], selector); + w[18] = __byte_perm (w[11], w[12], selector); + w[17] = __byte_perm (w[10], w[11], selector); + w[16] = __byte_perm (w[ 9], w[10], selector); + w[15] = __byte_perm (w[ 8], w[ 9], selector); + w[14] = __byte_perm (w[ 7], w[ 8], selector); + w[13] = __byte_perm (w[ 6], w[ 7], selector); + w[12] = __byte_perm (w[ 5], w[ 6], selector); + w[11] = __byte_perm (w[ 4], w[ 5], selector); + w[10] = __byte_perm (w[ 3], w[ 4], selector); + w[ 9] = __byte_perm (w[ 2], w[ 3], selector); + w[ 8] = __byte_perm (w[ 1], w[ 2], selector); + w[ 7] = __byte_perm (w[ 0], w[ 1], selector); + w[ 6] = __byte_perm ( 0, w[ 0], selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = __byte_perm (w[55], w[56], selector); + w[62] = __byte_perm (w[54], w[55], selector); + w[61] = __byte_perm (w[53], w[54], selector); + w[60] = __byte_perm (w[52], w[53], selector); + w[59] = __byte_perm (w[51], w[52], selector); + w[58] = __byte_perm (w[50], w[51], selector); + w[57] = __byte_perm (w[49], w[50], selector); + w[56] = __byte_perm (w[48], w[49], selector); + w[55] = __byte_perm (w[47], w[48], selector); + w[54] = __byte_perm (w[46], w[47], selector); + w[53] = __byte_perm (w[45], w[46], selector); + w[52] = __byte_perm (w[44], 
w[45], selector); + w[51] = __byte_perm (w[43], w[44], selector); + w[50] = __byte_perm (w[42], w[43], selector); + w[49] = __byte_perm (w[41], w[42], selector); + w[48] = __byte_perm (w[40], w[41], selector); + w[47] = __byte_perm (w[39], w[40], selector); + w[46] = __byte_perm (w[38], w[39], selector); + w[45] = __byte_perm (w[37], w[38], selector); + w[44] = __byte_perm (w[36], w[37], selector); + w[43] = __byte_perm (w[35], w[36], selector); + w[42] = __byte_perm (w[34], w[35], selector); + w[41] = __byte_perm (w[33], w[34], selector); + w[40] = __byte_perm (w[32], w[33], selector); + w[39] = __byte_perm (w[31], w[32], selector); + w[38] = __byte_perm (w[30], w[31], selector); + w[37] = __byte_perm (w[29], w[30], selector); + w[36] = __byte_perm (w[28], w[29], selector); + w[35] = __byte_perm (w[27], w[28], selector); + w[34] = __byte_perm (w[26], w[27], selector); + w[33] = __byte_perm (w[25], w[26], selector); + w[32] = __byte_perm (w[24], w[25], selector); + w[31] = __byte_perm (w[23], w[24], selector); + w[30] = __byte_perm (w[22], w[23], selector); + w[29] = __byte_perm (w[21], w[22], selector); + w[28] = __byte_perm (w[20], w[21], selector); + w[27] = __byte_perm (w[19], w[20], selector); + w[26] = __byte_perm (w[18], w[19], selector); + w[25] = __byte_perm (w[17], w[18], selector); + w[24] = __byte_perm (w[16], w[17], selector); + w[23] = __byte_perm (w[15], w[16], selector); + w[22] = __byte_perm (w[14], w[15], selector); + w[21] = __byte_perm (w[13], w[14], selector); + w[20] = __byte_perm (w[12], w[13], selector); + w[19] = __byte_perm (w[11], w[12], selector); + w[18] = __byte_perm (w[10], w[11], selector); + w[17] = __byte_perm (w[ 9], w[10], selector); + w[16] = __byte_perm (w[ 8], w[ 9], selector); + w[15] = __byte_perm (w[ 7], w[ 8], selector); + w[14] = __byte_perm (w[ 6], w[ 7], selector); + w[13] = __byte_perm (w[ 5], w[ 6], selector); + w[12] = __byte_perm (w[ 4], w[ 5], selector); + w[11] = __byte_perm (w[ 3], w[ 4], selector); + w[10] = 
__byte_perm (w[ 2], w[ 3], selector); + w[ 9] = __byte_perm (w[ 1], w[ 2], selector); + w[ 8] = __byte_perm (w[ 0], w[ 1], selector); + w[ 7] = __byte_perm ( 0, w[ 0], selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = __byte_perm (w[54], w[55], selector); + w[62] = __byte_perm (w[53], w[54], selector); + w[61] = __byte_perm (w[52], w[53], selector); + w[60] = __byte_perm (w[51], w[52], selector); + w[59] = __byte_perm (w[50], w[51], selector); + w[58] = __byte_perm (w[49], w[50], selector); + w[57] = __byte_perm (w[48], w[49], selector); + w[56] = __byte_perm (w[47], w[48], selector); + w[55] = __byte_perm (w[46], w[47], selector); + w[54] = __byte_perm (w[45], w[46], selector); + w[53] = __byte_perm (w[44], w[45], selector); + w[52] = __byte_perm (w[43], w[44], selector); + w[51] = __byte_perm (w[42], w[43], selector); + w[50] = __byte_perm (w[41], w[42], selector); + w[49] = __byte_perm (w[40], w[41], selector); + w[48] = __byte_perm (w[39], w[40], selector); + w[47] = __byte_perm (w[38], w[39], selector); + w[46] = __byte_perm (w[37], w[38], selector); + w[45] = __byte_perm (w[36], w[37], selector); + w[44] = __byte_perm (w[35], w[36], selector); + w[43] = __byte_perm (w[34], w[35], selector); + w[42] = __byte_perm (w[33], w[34], selector); + w[41] = __byte_perm (w[32], w[33], selector); + w[40] = __byte_perm (w[31], w[32], selector); + w[39] = __byte_perm (w[30], w[31], selector); + w[38] = __byte_perm (w[29], w[30], selector); + w[37] = __byte_perm (w[28], w[29], selector); + w[36] = __byte_perm (w[27], w[28], selector); + w[35] = __byte_perm (w[26], w[27], selector); + w[34] = __byte_perm (w[25], w[26], selector); + w[33] = __byte_perm (w[24], w[25], selector); + w[32] = __byte_perm (w[23], w[24], selector); + w[31] = __byte_perm (w[22], w[23], selector); + w[30] = __byte_perm (w[21], w[22], selector); + w[29] = __byte_perm (w[20], w[21], selector); + w[28] = __byte_perm 
(w[19], w[20], selector); + w[27] = __byte_perm (w[18], w[19], selector); + w[26] = __byte_perm (w[17], w[18], selector); + w[25] = __byte_perm (w[16], w[17], selector); + w[24] = __byte_perm (w[15], w[16], selector); + w[23] = __byte_perm (w[14], w[15], selector); + w[22] = __byte_perm (w[13], w[14], selector); + w[21] = __byte_perm (w[12], w[13], selector); + w[20] = __byte_perm (w[11], w[12], selector); + w[19] = __byte_perm (w[10], w[11], selector); + w[18] = __byte_perm (w[ 9], w[10], selector); + w[17] = __byte_perm (w[ 8], w[ 9], selector); + w[16] = __byte_perm (w[ 7], w[ 8], selector); + w[15] = __byte_perm (w[ 6], w[ 7], selector); + w[14] = __byte_perm (w[ 5], w[ 6], selector); + w[13] = __byte_perm (w[ 4], w[ 5], selector); + w[12] = __byte_perm (w[ 3], w[ 4], selector); + w[11] = __byte_perm (w[ 2], w[ 3], selector); + w[10] = __byte_perm (w[ 1], w[ 2], selector); + w[ 9] = __byte_perm (w[ 0], w[ 1], selector); + w[ 8] = __byte_perm ( 0, w[ 0], selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = __byte_perm (w[53], w[54], selector); + w[62] = __byte_perm (w[52], w[53], selector); + w[61] = __byte_perm (w[51], w[52], selector); + w[60] = __byte_perm (w[50], w[51], selector); + w[59] = __byte_perm (w[49], w[50], selector); + w[58] = __byte_perm (w[48], w[49], selector); + w[57] = __byte_perm (w[47], w[48], selector); + w[56] = __byte_perm (w[46], w[47], selector); + w[55] = __byte_perm (w[45], w[46], selector); + w[54] = __byte_perm (w[44], w[45], selector); + w[53] = __byte_perm (w[43], w[44], selector); + w[52] = __byte_perm (w[42], w[43], selector); + w[51] = __byte_perm (w[41], w[42], selector); + w[50] = __byte_perm (w[40], w[41], selector); + w[49] = __byte_perm (w[39], w[40], selector); + w[48] = __byte_perm (w[38], w[39], selector); + w[47] = __byte_perm (w[37], w[38], selector); + w[46] = __byte_perm (w[36], w[37], selector); + w[45] = __byte_perm 
(w[35], w[36], selector); + w[44] = __byte_perm (w[34], w[35], selector); + w[43] = __byte_perm (w[33], w[34], selector); + w[42] = __byte_perm (w[32], w[33], selector); + w[41] = __byte_perm (w[31], w[32], selector); + w[40] = __byte_perm (w[30], w[31], selector); + w[39] = __byte_perm (w[29], w[30], selector); + w[38] = __byte_perm (w[28], w[29], selector); + w[37] = __byte_perm (w[27], w[28], selector); + w[36] = __byte_perm (w[26], w[27], selector); + w[35] = __byte_perm (w[25], w[26], selector); + w[34] = __byte_perm (w[24], w[25], selector); + w[33] = __byte_perm (w[23], w[24], selector); + w[32] = __byte_perm (w[22], w[23], selector); + w[31] = __byte_perm (w[21], w[22], selector); + w[30] = __byte_perm (w[20], w[21], selector); + w[29] = __byte_perm (w[19], w[20], selector); + w[28] = __byte_perm (w[18], w[19], selector); + w[27] = __byte_perm (w[17], w[18], selector); + w[26] = __byte_perm (w[16], w[17], selector); + w[25] = __byte_perm (w[15], w[16], selector); + w[24] = __byte_perm (w[14], w[15], selector); + w[23] = __byte_perm (w[13], w[14], selector); + w[22] = __byte_perm (w[12], w[13], selector); + w[21] = __byte_perm (w[11], w[12], selector); + w[20] = __byte_perm (w[10], w[11], selector); + w[19] = __byte_perm (w[ 9], w[10], selector); + w[18] = __byte_perm (w[ 8], w[ 9], selector); + w[17] = __byte_perm (w[ 7], w[ 8], selector); + w[16] = __byte_perm (w[ 6], w[ 7], selector); + w[15] = __byte_perm (w[ 5], w[ 6], selector); + w[14] = __byte_perm (w[ 4], w[ 5], selector); + w[13] = __byte_perm (w[ 3], w[ 4], selector); + w[12] = __byte_perm (w[ 2], w[ 3], selector); + w[11] = __byte_perm (w[ 1], w[ 2], selector); + w[10] = __byte_perm (w[ 0], w[ 1], selector); + w[ 9] = __byte_perm ( 0, w[ 0], selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = __byte_perm (w[52], w[53], selector); + w[62] = __byte_perm (w[51], w[52], selector); + w[61] = 
__byte_perm (w[50], w[51], selector); + w[60] = __byte_perm (w[49], w[50], selector); + w[59] = __byte_perm (w[48], w[49], selector); + w[58] = __byte_perm (w[47], w[48], selector); + w[57] = __byte_perm (w[46], w[47], selector); + w[56] = __byte_perm (w[45], w[46], selector); + w[55] = __byte_perm (w[44], w[45], selector); + w[54] = __byte_perm (w[43], w[44], selector); + w[53] = __byte_perm (w[42], w[43], selector); + w[52] = __byte_perm (w[41], w[42], selector); + w[51] = __byte_perm (w[40], w[41], selector); + w[50] = __byte_perm (w[39], w[40], selector); + w[49] = __byte_perm (w[38], w[39], selector); + w[48] = __byte_perm (w[37], w[38], selector); + w[47] = __byte_perm (w[36], w[37], selector); + w[46] = __byte_perm (w[35], w[36], selector); + w[45] = __byte_perm (w[34], w[35], selector); + w[44] = __byte_perm (w[33], w[34], selector); + w[43] = __byte_perm (w[32], w[33], selector); + w[42] = __byte_perm (w[31], w[32], selector); + w[41] = __byte_perm (w[30], w[31], selector); + w[40] = __byte_perm (w[29], w[30], selector); + w[39] = __byte_perm (w[28], w[29], selector); + w[38] = __byte_perm (w[27], w[28], selector); + w[37] = __byte_perm (w[26], w[27], selector); + w[36] = __byte_perm (w[25], w[26], selector); + w[35] = __byte_perm (w[24], w[25], selector); + w[34] = __byte_perm (w[23], w[24], selector); + w[33] = __byte_perm (w[22], w[23], selector); + w[32] = __byte_perm (w[21], w[22], selector); + w[31] = __byte_perm (w[20], w[21], selector); + w[30] = __byte_perm (w[19], w[20], selector); + w[29] = __byte_perm (w[18], w[19], selector); + w[28] = __byte_perm (w[17], w[18], selector); + w[27] = __byte_perm (w[16], w[17], selector); + w[26] = __byte_perm (w[15], w[16], selector); + w[25] = __byte_perm (w[14], w[15], selector); + w[24] = __byte_perm (w[13], w[14], selector); + w[23] = __byte_perm (w[12], w[13], selector); + w[22] = __byte_perm (w[11], w[12], selector); + w[21] = __byte_perm (w[10], w[11], selector); + w[20] = __byte_perm (w[ 9], w[10], 
selector); + w[19] = __byte_perm (w[ 8], w[ 9], selector); + w[18] = __byte_perm (w[ 7], w[ 8], selector); + w[17] = __byte_perm (w[ 6], w[ 7], selector); + w[16] = __byte_perm (w[ 5], w[ 6], selector); + w[15] = __byte_perm (w[ 4], w[ 5], selector); + w[14] = __byte_perm (w[ 3], w[ 4], selector); + w[13] = __byte_perm (w[ 2], w[ 3], selector); + w[12] = __byte_perm (w[ 1], w[ 2], selector); + w[11] = __byte_perm (w[ 0], w[ 1], selector); + w[10] = __byte_perm ( 0, w[ 0], selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = __byte_perm (w[51], w[52], selector); + w[62] = __byte_perm (w[50], w[51], selector); + w[61] = __byte_perm (w[49], w[50], selector); + w[60] = __byte_perm (w[48], w[49], selector); + w[59] = __byte_perm (w[47], w[48], selector); + w[58] = __byte_perm (w[46], w[47], selector); + w[57] = __byte_perm (w[45], w[46], selector); + w[56] = __byte_perm (w[44], w[45], selector); + w[55] = __byte_perm (w[43], w[44], selector); + w[54] = __byte_perm (w[42], w[43], selector); + w[53] = __byte_perm (w[41], w[42], selector); + w[52] = __byte_perm (w[40], w[41], selector); + w[51] = __byte_perm (w[39], w[40], selector); + w[50] = __byte_perm (w[38], w[39], selector); + w[49] = __byte_perm (w[37], w[38], selector); + w[48] = __byte_perm (w[36], w[37], selector); + w[47] = __byte_perm (w[35], w[36], selector); + w[46] = __byte_perm (w[34], w[35], selector); + w[45] = __byte_perm (w[33], w[34], selector); + w[44] = __byte_perm (w[32], w[33], selector); + w[43] = __byte_perm (w[31], w[32], selector); + w[42] = __byte_perm (w[30], w[31], selector); + w[41] = __byte_perm (w[29], w[30], selector); + w[40] = __byte_perm (w[28], w[29], selector); + w[39] = __byte_perm (w[27], w[28], selector); + w[38] = __byte_perm (w[26], w[27], selector); + w[37] = __byte_perm (w[25], w[26], selector); + w[36] = __byte_perm (w[24], w[25], selector); + w[35] = 
__byte_perm (w[23], w[24], selector); + w[34] = __byte_perm (w[22], w[23], selector); + w[33] = __byte_perm (w[21], w[22], selector); + w[32] = __byte_perm (w[20], w[21], selector); + w[31] = __byte_perm (w[19], w[20], selector); + w[30] = __byte_perm (w[18], w[19], selector); + w[29] = __byte_perm (w[17], w[18], selector); + w[28] = __byte_perm (w[16], w[17], selector); + w[27] = __byte_perm (w[15], w[16], selector); + w[26] = __byte_perm (w[14], w[15], selector); + w[25] = __byte_perm (w[13], w[14], selector); + w[24] = __byte_perm (w[12], w[13], selector); + w[23] = __byte_perm (w[11], w[12], selector); + w[22] = __byte_perm (w[10], w[11], selector); + w[21] = __byte_perm (w[ 9], w[10], selector); + w[20] = __byte_perm (w[ 8], w[ 9], selector); + w[19] = __byte_perm (w[ 7], w[ 8], selector); + w[18] = __byte_perm (w[ 6], w[ 7], selector); + w[17] = __byte_perm (w[ 5], w[ 6], selector); + w[16] = __byte_perm (w[ 4], w[ 5], selector); + w[15] = __byte_perm (w[ 3], w[ 4], selector); + w[14] = __byte_perm (w[ 2], w[ 3], selector); + w[13] = __byte_perm (w[ 1], w[ 2], selector); + w[12] = __byte_perm (w[ 0], w[ 1], selector); + w[11] = __byte_perm ( 0, w[ 0], selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = __byte_perm (w[50], w[51], selector); + w[62] = __byte_perm (w[49], w[50], selector); + w[61] = __byte_perm (w[48], w[49], selector); + w[60] = __byte_perm (w[47], w[48], selector); + w[59] = __byte_perm (w[46], w[47], selector); + w[58] = __byte_perm (w[45], w[46], selector); + w[57] = __byte_perm (w[44], w[45], selector); + w[56] = __byte_perm (w[43], w[44], selector); + w[55] = __byte_perm (w[42], w[43], selector); + w[54] = __byte_perm (w[41], w[42], selector); + w[53] = __byte_perm (w[40], w[41], selector); + w[52] = __byte_perm (w[39], w[40], selector); + w[51] = __byte_perm (w[38], w[39], selector); + w[50] = __byte_perm 
(w[37], w[38], selector); + w[49] = __byte_perm (w[36], w[37], selector); + w[48] = __byte_perm (w[35], w[36], selector); + w[47] = __byte_perm (w[34], w[35], selector); + w[46] = __byte_perm (w[33], w[34], selector); + w[45] = __byte_perm (w[32], w[33], selector); + w[44] = __byte_perm (w[31], w[32], selector); + w[43] = __byte_perm (w[30], w[31], selector); + w[42] = __byte_perm (w[29], w[30], selector); + w[41] = __byte_perm (w[28], w[29], selector); + w[40] = __byte_perm (w[27], w[28], selector); + w[39] = __byte_perm (w[26], w[27], selector); + w[38] = __byte_perm (w[25], w[26], selector); + w[37] = __byte_perm (w[24], w[25], selector); + w[36] = __byte_perm (w[23], w[24], selector); + w[35] = __byte_perm (w[22], w[23], selector); + w[34] = __byte_perm (w[21], w[22], selector); + w[33] = __byte_perm (w[20], w[21], selector); + w[32] = __byte_perm (w[19], w[20], selector); + w[31] = __byte_perm (w[18], w[19], selector); + w[30] = __byte_perm (w[17], w[18], selector); + w[29] = __byte_perm (w[16], w[17], selector); + w[28] = __byte_perm (w[15], w[16], selector); + w[27] = __byte_perm (w[14], w[15], selector); + w[26] = __byte_perm (w[13], w[14], selector); + w[25] = __byte_perm (w[12], w[13], selector); + w[24] = __byte_perm (w[11], w[12], selector); + w[23] = __byte_perm (w[10], w[11], selector); + w[22] = __byte_perm (w[ 9], w[10], selector); + w[21] = __byte_perm (w[ 8], w[ 9], selector); + w[20] = __byte_perm (w[ 7], w[ 8], selector); + w[19] = __byte_perm (w[ 6], w[ 7], selector); + w[18] = __byte_perm (w[ 5], w[ 6], selector); + w[17] = __byte_perm (w[ 4], w[ 5], selector); + w[16] = __byte_perm (w[ 3], w[ 4], selector); + w[15] = __byte_perm (w[ 2], w[ 3], selector); + w[14] = __byte_perm (w[ 1], w[ 2], selector); + w[13] = __byte_perm (w[ 0], w[ 1], selector); + w[12] = __byte_perm ( 0, w[ 0], selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 
0; + + break; + + case 13: + w[63] = __byte_perm (w[49], w[50], selector); + w[62] = __byte_perm (w[48], w[49], selector); + w[61] = __byte_perm (w[47], w[48], selector); + w[60] = __byte_perm (w[46], w[47], selector); + w[59] = __byte_perm (w[45], w[46], selector); + w[58] = __byte_perm (w[44], w[45], selector); + w[57] = __byte_perm (w[43], w[44], selector); + w[56] = __byte_perm (w[42], w[43], selector); + w[55] = __byte_perm (w[41], w[42], selector); + w[54] = __byte_perm (w[40], w[41], selector); + w[53] = __byte_perm (w[39], w[40], selector); + w[52] = __byte_perm (w[38], w[39], selector); + w[51] = __byte_perm (w[37], w[38], selector); + w[50] = __byte_perm (w[36], w[37], selector); + w[49] = __byte_perm (w[35], w[36], selector); + w[48] = __byte_perm (w[34], w[35], selector); + w[47] = __byte_perm (w[33], w[34], selector); + w[46] = __byte_perm (w[32], w[33], selector); + w[45] = __byte_perm (w[31], w[32], selector); + w[44] = __byte_perm (w[30], w[31], selector); + w[43] = __byte_perm (w[29], w[30], selector); + w[42] = __byte_perm (w[28], w[29], selector); + w[41] = __byte_perm (w[27], w[28], selector); + w[40] = __byte_perm (w[26], w[27], selector); + w[39] = __byte_perm (w[25], w[26], selector); + w[38] = __byte_perm (w[24], w[25], selector); + w[37] = __byte_perm (w[23], w[24], selector); + w[36] = __byte_perm (w[22], w[23], selector); + w[35] = __byte_perm (w[21], w[22], selector); + w[34] = __byte_perm (w[20], w[21], selector); + w[33] = __byte_perm (w[19], w[20], selector); + w[32] = __byte_perm (w[18], w[19], selector); + w[31] = __byte_perm (w[17], w[18], selector); + w[30] = __byte_perm (w[16], w[17], selector); + w[29] = __byte_perm (w[15], w[16], selector); + w[28] = __byte_perm (w[14], w[15], selector); + w[27] = __byte_perm (w[13], w[14], selector); + w[26] = __byte_perm (w[12], w[13], selector); + w[25] = __byte_perm (w[11], w[12], selector); + w[24] = __byte_perm (w[10], w[11], selector); + w[23] = __byte_perm (w[ 9], w[10], selector); + 
w[22] = __byte_perm (w[ 8], w[ 9], selector); + w[21] = __byte_perm (w[ 7], w[ 8], selector); + w[20] = __byte_perm (w[ 6], w[ 7], selector); + w[19] = __byte_perm (w[ 5], w[ 6], selector); + w[18] = __byte_perm (w[ 4], w[ 5], selector); + w[17] = __byte_perm (w[ 3], w[ 4], selector); + w[16] = __byte_perm (w[ 2], w[ 3], selector); + w[15] = __byte_perm (w[ 1], w[ 2], selector); + w[14] = __byte_perm (w[ 0], w[ 1], selector); + w[13] = __byte_perm ( 0, w[ 0], selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = __byte_perm (w[48], w[49], selector); + w[62] = __byte_perm (w[47], w[48], selector); + w[61] = __byte_perm (w[46], w[47], selector); + w[60] = __byte_perm (w[45], w[46], selector); + w[59] = __byte_perm (w[44], w[45], selector); + w[58] = __byte_perm (w[43], w[44], selector); + w[57] = __byte_perm (w[42], w[43], selector); + w[56] = __byte_perm (w[41], w[42], selector); + w[55] = __byte_perm (w[40], w[41], selector); + w[54] = __byte_perm (w[39], w[40], selector); + w[53] = __byte_perm (w[38], w[39], selector); + w[52] = __byte_perm (w[37], w[38], selector); + w[51] = __byte_perm (w[36], w[37], selector); + w[50] = __byte_perm (w[35], w[36], selector); + w[49] = __byte_perm (w[34], w[35], selector); + w[48] = __byte_perm (w[33], w[34], selector); + w[47] = __byte_perm (w[32], w[33], selector); + w[46] = __byte_perm (w[31], w[32], selector); + w[45] = __byte_perm (w[30], w[31], selector); + w[44] = __byte_perm (w[29], w[30], selector); + w[43] = __byte_perm (w[28], w[29], selector); + w[42] = __byte_perm (w[27], w[28], selector); + w[41] = __byte_perm (w[26], w[27], selector); + w[40] = __byte_perm (w[25], w[26], selector); + w[39] = __byte_perm (w[24], w[25], selector); + w[38] = __byte_perm (w[23], w[24], selector); + w[37] = __byte_perm (w[22], w[23], selector); + w[36] = __byte_perm (w[21], 
w[22], selector); + w[35] = __byte_perm (w[20], w[21], selector); + w[34] = __byte_perm (w[19], w[20], selector); + w[33] = __byte_perm (w[18], w[19], selector); + w[32] = __byte_perm (w[17], w[18], selector); + w[31] = __byte_perm (w[16], w[17], selector); + w[30] = __byte_perm (w[15], w[16], selector); + w[29] = __byte_perm (w[14], w[15], selector); + w[28] = __byte_perm (w[13], w[14], selector); + w[27] = __byte_perm (w[12], w[13], selector); + w[26] = __byte_perm (w[11], w[12], selector); + w[25] = __byte_perm (w[10], w[11], selector); + w[24] = __byte_perm (w[ 9], w[10], selector); + w[23] = __byte_perm (w[ 8], w[ 9], selector); + w[22] = __byte_perm (w[ 7], w[ 8], selector); + w[21] = __byte_perm (w[ 6], w[ 7], selector); + w[20] = __byte_perm (w[ 5], w[ 6], selector); + w[19] = __byte_perm (w[ 4], w[ 5], selector); + w[18] = __byte_perm (w[ 3], w[ 4], selector); + w[17] = __byte_perm (w[ 2], w[ 3], selector); + w[16] = __byte_perm (w[ 1], w[ 2], selector); + w[15] = __byte_perm (w[ 0], w[ 1], selector); + w[14] = __byte_perm ( 0, w[ 0], selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = __byte_perm (w[47], w[48], selector); + w[62] = __byte_perm (w[46], w[47], selector); + w[61] = __byte_perm (w[45], w[46], selector); + w[60] = __byte_perm (w[44], w[45], selector); + w[59] = __byte_perm (w[43], w[44], selector); + w[58] = __byte_perm (w[42], w[43], selector); + w[57] = __byte_perm (w[41], w[42], selector); + w[56] = __byte_perm (w[40], w[41], selector); + w[55] = __byte_perm (w[39], w[40], selector); + w[54] = __byte_perm (w[38], w[39], selector); + w[53] = __byte_perm (w[37], w[38], selector); + w[52] = __byte_perm (w[36], w[37], selector); + w[51] = __byte_perm (w[35], w[36], selector); + w[50] = __byte_perm (w[34], w[35], selector); + w[49] = __byte_perm (w[33], w[34], selector); + 
w[48] = __byte_perm (w[32], w[33], selector); + w[47] = __byte_perm (w[31], w[32], selector); + w[46] = __byte_perm (w[30], w[31], selector); + w[45] = __byte_perm (w[29], w[30], selector); + w[44] = __byte_perm (w[28], w[29], selector); + w[43] = __byte_perm (w[27], w[28], selector); + w[42] = __byte_perm (w[26], w[27], selector); + w[41] = __byte_perm (w[25], w[26], selector); + w[40] = __byte_perm (w[24], w[25], selector); + w[39] = __byte_perm (w[23], w[24], selector); + w[38] = __byte_perm (w[22], w[23], selector); + w[37] = __byte_perm (w[21], w[22], selector); + w[36] = __byte_perm (w[20], w[21], selector); + w[35] = __byte_perm (w[19], w[20], selector); + w[34] = __byte_perm (w[18], w[19], selector); + w[33] = __byte_perm (w[17], w[18], selector); + w[32] = __byte_perm (w[16], w[17], selector); + w[31] = __byte_perm (w[15], w[16], selector); + w[30] = __byte_perm (w[14], w[15], selector); + w[29] = __byte_perm (w[13], w[14], selector); + w[28] = __byte_perm (w[12], w[13], selector); + w[27] = __byte_perm (w[11], w[12], selector); + w[26] = __byte_perm (w[10], w[11], selector); + w[25] = __byte_perm (w[ 9], w[10], selector); + w[24] = __byte_perm (w[ 8], w[ 9], selector); + w[23] = __byte_perm (w[ 7], w[ 8], selector); + w[22] = __byte_perm (w[ 6], w[ 7], selector); + w[21] = __byte_perm (w[ 5], w[ 6], selector); + w[20] = __byte_perm (w[ 4], w[ 5], selector); + w[19] = __byte_perm (w[ 3], w[ 4], selector); + w[18] = __byte_perm (w[ 2], w[ 3], selector); + w[17] = __byte_perm (w[ 1], w[ 2], selector); + w[16] = __byte_perm (w[ 0], w[ 1], selector); + w[15] = __byte_perm ( 0, w[ 0], selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = __byte_perm (w[46], w[47], selector); + w[62] = __byte_perm (w[45], w[46], selector); + w[61] = __byte_perm (w[44], w[45], selector); + w[60] = 
__byte_perm (w[43], w[44], selector); + w[59] = __byte_perm (w[42], w[43], selector); + w[58] = __byte_perm (w[41], w[42], selector); + w[57] = __byte_perm (w[40], w[41], selector); + w[56] = __byte_perm (w[39], w[40], selector); + w[55] = __byte_perm (w[38], w[39], selector); + w[54] = __byte_perm (w[37], w[38], selector); + w[53] = __byte_perm (w[36], w[37], selector); + w[52] = __byte_perm (w[35], w[36], selector); + w[51] = __byte_perm (w[34], w[35], selector); + w[50] = __byte_perm (w[33], w[34], selector); + w[49] = __byte_perm (w[32], w[33], selector); + w[48] = __byte_perm (w[31], w[32], selector); + w[47] = __byte_perm (w[30], w[31], selector); + w[46] = __byte_perm (w[29], w[30], selector); + w[45] = __byte_perm (w[28], w[29], selector); + w[44] = __byte_perm (w[27], w[28], selector); + w[43] = __byte_perm (w[26], w[27], selector); + w[42] = __byte_perm (w[25], w[26], selector); + w[41] = __byte_perm (w[24], w[25], selector); + w[40] = __byte_perm (w[23], w[24], selector); + w[39] = __byte_perm (w[22], w[23], selector); + w[38] = __byte_perm (w[21], w[22], selector); + w[37] = __byte_perm (w[20], w[21], selector); + w[36] = __byte_perm (w[19], w[20], selector); + w[35] = __byte_perm (w[18], w[19], selector); + w[34] = __byte_perm (w[17], w[18], selector); + w[33] = __byte_perm (w[16], w[17], selector); + w[32] = __byte_perm (w[15], w[16], selector); + w[31] = __byte_perm (w[14], w[15], selector); + w[30] = __byte_perm (w[13], w[14], selector); + w[29] = __byte_perm (w[12], w[13], selector); + w[28] = __byte_perm (w[11], w[12], selector); + w[27] = __byte_perm (w[10], w[11], selector); + w[26] = __byte_perm (w[ 9], w[10], selector); + w[25] = __byte_perm (w[ 8], w[ 9], selector); + w[24] = __byte_perm (w[ 7], w[ 8], selector); + w[23] = __byte_perm (w[ 6], w[ 7], selector); + w[22] = __byte_perm (w[ 5], w[ 6], selector); + w[21] = __byte_perm (w[ 4], w[ 5], selector); + w[20] = __byte_perm (w[ 3], w[ 4], selector); + w[19] = __byte_perm (w[ 2], w[ 3], 
selector); + w[18] = __byte_perm (w[ 1], w[ 2], selector); + w[17] = __byte_perm (w[ 0], w[ 1], selector); + w[16] = __byte_perm ( 0, w[ 0], selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = __byte_perm (w[45], w[46], selector); + w[62] = __byte_perm (w[44], w[45], selector); + w[61] = __byte_perm (w[43], w[44], selector); + w[60] = __byte_perm (w[42], w[43], selector); + w[59] = __byte_perm (w[41], w[42], selector); + w[58] = __byte_perm (w[40], w[41], selector); + w[57] = __byte_perm (w[39], w[40], selector); + w[56] = __byte_perm (w[38], w[39], selector); + w[55] = __byte_perm (w[37], w[38], selector); + w[54] = __byte_perm (w[36], w[37], selector); + w[53] = __byte_perm (w[35], w[36], selector); + w[52] = __byte_perm (w[34], w[35], selector); + w[51] = __byte_perm (w[33], w[34], selector); + w[50] = __byte_perm (w[32], w[33], selector); + w[49] = __byte_perm (w[31], w[32], selector); + w[48] = __byte_perm (w[30], w[31], selector); + w[47] = __byte_perm (w[29], w[30], selector); + w[46] = __byte_perm (w[28], w[29], selector); + w[45] = __byte_perm (w[27], w[28], selector); + w[44] = __byte_perm (w[26], w[27], selector); + w[43] = __byte_perm (w[25], w[26], selector); + w[42] = __byte_perm (w[24], w[25], selector); + w[41] = __byte_perm (w[23], w[24], selector); + w[40] = __byte_perm (w[22], w[23], selector); + w[39] = __byte_perm (w[21], w[22], selector); + w[38] = __byte_perm (w[20], w[21], selector); + w[37] = __byte_perm (w[19], w[20], selector); + w[36] = __byte_perm (w[18], w[19], selector); + w[35] = __byte_perm (w[17], w[18], selector); + w[34] = __byte_perm (w[16], w[17], selector); + w[33] = __byte_perm (w[15], w[16], selector); + w[32] = __byte_perm (w[14], w[15], selector); + w[31] = __byte_perm (w[13], w[14], selector); + w[30] = __byte_perm (w[12], 
w[13], selector); + w[29] = __byte_perm (w[11], w[12], selector); + w[28] = __byte_perm (w[10], w[11], selector); + w[27] = __byte_perm (w[ 9], w[10], selector); + w[26] = __byte_perm (w[ 8], w[ 9], selector); + w[25] = __byte_perm (w[ 7], w[ 8], selector); + w[24] = __byte_perm (w[ 6], w[ 7], selector); + w[23] = __byte_perm (w[ 5], w[ 6], selector); + w[22] = __byte_perm (w[ 4], w[ 5], selector); + w[21] = __byte_perm (w[ 3], w[ 4], selector); + w[20] = __byte_perm (w[ 2], w[ 3], selector); + w[19] = __byte_perm (w[ 1], w[ 2], selector); + w[18] = __byte_perm (w[ 0], w[ 1], selector); + w[17] = __byte_perm ( 0, w[ 0], selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = __byte_perm (w[44], w[45], selector); + w[62] = __byte_perm (w[43], w[44], selector); + w[61] = __byte_perm (w[42], w[43], selector); + w[60] = __byte_perm (w[41], w[42], selector); + w[59] = __byte_perm (w[40], w[41], selector); + w[58] = __byte_perm (w[39], w[40], selector); + w[57] = __byte_perm (w[38], w[39], selector); + w[56] = __byte_perm (w[37], w[38], selector); + w[55] = __byte_perm (w[36], w[37], selector); + w[54] = __byte_perm (w[35], w[36], selector); + w[53] = __byte_perm (w[34], w[35], selector); + w[52] = __byte_perm (w[33], w[34], selector); + w[51] = __byte_perm (w[32], w[33], selector); + w[50] = __byte_perm (w[31], w[32], selector); + w[49] = __byte_perm (w[30], w[31], selector); + w[48] = __byte_perm (w[29], w[30], selector); + w[47] = __byte_perm (w[28], w[29], selector); + w[46] = __byte_perm (w[27], w[28], selector); + w[45] = __byte_perm (w[26], w[27], selector); + w[44] = __byte_perm (w[25], w[26], selector); + w[43] = __byte_perm (w[24], w[25], selector); + w[42] = __byte_perm (w[23], w[24], selector); + w[41] = __byte_perm (w[22], w[23], selector); + w[40] = 
__byte_perm (w[21], w[22], selector); + w[39] = __byte_perm (w[20], w[21], selector); + w[38] = __byte_perm (w[19], w[20], selector); + w[37] = __byte_perm (w[18], w[19], selector); + w[36] = __byte_perm (w[17], w[18], selector); + w[35] = __byte_perm (w[16], w[17], selector); + w[34] = __byte_perm (w[15], w[16], selector); + w[33] = __byte_perm (w[14], w[15], selector); + w[32] = __byte_perm (w[13], w[14], selector); + w[31] = __byte_perm (w[12], w[13], selector); + w[30] = __byte_perm (w[11], w[12], selector); + w[29] = __byte_perm (w[10], w[11], selector); + w[28] = __byte_perm (w[ 9], w[10], selector); + w[27] = __byte_perm (w[ 8], w[ 9], selector); + w[26] = __byte_perm (w[ 7], w[ 8], selector); + w[25] = __byte_perm (w[ 6], w[ 7], selector); + w[24] = __byte_perm (w[ 5], w[ 6], selector); + w[23] = __byte_perm (w[ 4], w[ 5], selector); + w[22] = __byte_perm (w[ 3], w[ 4], selector); + w[21] = __byte_perm (w[ 2], w[ 3], selector); + w[20] = __byte_perm (w[ 1], w[ 2], selector); + w[19] = __byte_perm (w[ 0], w[ 1], selector); + w[18] = __byte_perm ( 0, w[ 0], selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = __byte_perm (w[43], w[44], selector); + w[62] = __byte_perm (w[42], w[43], selector); + w[61] = __byte_perm (w[41], w[42], selector); + w[60] = __byte_perm (w[40], w[41], selector); + w[59] = __byte_perm (w[39], w[40], selector); + w[58] = __byte_perm (w[38], w[39], selector); + w[57] = __byte_perm (w[37], w[38], selector); + w[56] = __byte_perm (w[36], w[37], selector); + w[55] = __byte_perm (w[35], w[36], selector); + w[54] = __byte_perm (w[34], w[35], selector); + w[53] = __byte_perm (w[33], w[34], selector); + w[52] = __byte_perm (w[32], w[33], selector); + w[51] = __byte_perm (w[31], w[32], selector); + w[50] = __byte_perm 
(w[30], w[31], selector); + w[49] = __byte_perm (w[29], w[30], selector); + w[48] = __byte_perm (w[28], w[29], selector); + w[47] = __byte_perm (w[27], w[28], selector); + w[46] = __byte_perm (w[26], w[27], selector); + w[45] = __byte_perm (w[25], w[26], selector); + w[44] = __byte_perm (w[24], w[25], selector); + w[43] = __byte_perm (w[23], w[24], selector); + w[42] = __byte_perm (w[22], w[23], selector); + w[41] = __byte_perm (w[21], w[22], selector); + w[40] = __byte_perm (w[20], w[21], selector); + w[39] = __byte_perm (w[19], w[20], selector); + w[38] = __byte_perm (w[18], w[19], selector); + w[37] = __byte_perm (w[17], w[18], selector); + w[36] = __byte_perm (w[16], w[17], selector); + w[35] = __byte_perm (w[15], w[16], selector); + w[34] = __byte_perm (w[14], w[15], selector); + w[33] = __byte_perm (w[13], w[14], selector); + w[32] = __byte_perm (w[12], w[13], selector); + w[31] = __byte_perm (w[11], w[12], selector); + w[30] = __byte_perm (w[10], w[11], selector); + w[29] = __byte_perm (w[ 9], w[10], selector); + w[28] = __byte_perm (w[ 8], w[ 9], selector); + w[27] = __byte_perm (w[ 7], w[ 8], selector); + w[26] = __byte_perm (w[ 6], w[ 7], selector); + w[25] = __byte_perm (w[ 5], w[ 6], selector); + w[24] = __byte_perm (w[ 4], w[ 5], selector); + w[23] = __byte_perm (w[ 3], w[ 4], selector); + w[22] = __byte_perm (w[ 2], w[ 3], selector); + w[21] = __byte_perm (w[ 1], w[ 2], selector); + w[20] = __byte_perm (w[ 0], w[ 1], selector); + w[19] = __byte_perm ( 0, w[ 0], selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = __byte_perm (w[42], w[43], selector); + w[62] = __byte_perm (w[41], w[42], selector); + w[61] = __byte_perm (w[40], w[41], selector); + w[60] = __byte_perm (w[39], w[40], selector); + w[59] = __byte_perm 
(w[38], w[39], selector); + w[58] = __byte_perm (w[37], w[38], selector); + w[57] = __byte_perm (w[36], w[37], selector); + w[56] = __byte_perm (w[35], w[36], selector); + w[55] = __byte_perm (w[34], w[35], selector); + w[54] = __byte_perm (w[33], w[34], selector); + w[53] = __byte_perm (w[32], w[33], selector); + w[52] = __byte_perm (w[31], w[32], selector); + w[51] = __byte_perm (w[30], w[31], selector); + w[50] = __byte_perm (w[29], w[30], selector); + w[49] = __byte_perm (w[28], w[29], selector); + w[48] = __byte_perm (w[27], w[28], selector); + w[47] = __byte_perm (w[26], w[27], selector); + w[46] = __byte_perm (w[25], w[26], selector); + w[45] = __byte_perm (w[24], w[25], selector); + w[44] = __byte_perm (w[23], w[24], selector); + w[43] = __byte_perm (w[22], w[23], selector); + w[42] = __byte_perm (w[21], w[22], selector); + w[41] = __byte_perm (w[20], w[21], selector); + w[40] = __byte_perm (w[19], w[20], selector); + w[39] = __byte_perm (w[18], w[19], selector); + w[38] = __byte_perm (w[17], w[18], selector); + w[37] = __byte_perm (w[16], w[17], selector); + w[36] = __byte_perm (w[15], w[16], selector); + w[35] = __byte_perm (w[14], w[15], selector); + w[34] = __byte_perm (w[13], w[14], selector); + w[33] = __byte_perm (w[12], w[13], selector); + w[32] = __byte_perm (w[11], w[12], selector); + w[31] = __byte_perm (w[10], w[11], selector); + w[30] = __byte_perm (w[ 9], w[10], selector); + w[29] = __byte_perm (w[ 8], w[ 9], selector); + w[28] = __byte_perm (w[ 7], w[ 8], selector); + w[27] = __byte_perm (w[ 6], w[ 7], selector); + w[26] = __byte_perm (w[ 5], w[ 6], selector); + w[25] = __byte_perm (w[ 4], w[ 5], selector); + w[24] = __byte_perm (w[ 3], w[ 4], selector); + w[23] = __byte_perm (w[ 2], w[ 3], selector); + w[22] = __byte_perm (w[ 1], w[ 2], selector); + w[21] = __byte_perm (w[ 0], w[ 1], selector); + w[20] = __byte_perm ( 0, w[ 0], selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; 
+ w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = __byte_perm (w[41], w[42], selector); + w[62] = __byte_perm (w[40], w[41], selector); + w[61] = __byte_perm (w[39], w[40], selector); + w[60] = __byte_perm (w[38], w[39], selector); + w[59] = __byte_perm (w[37], w[38], selector); + w[58] = __byte_perm (w[36], w[37], selector); + w[57] = __byte_perm (w[35], w[36], selector); + w[56] = __byte_perm (w[34], w[35], selector); + w[55] = __byte_perm (w[33], w[34], selector); + w[54] = __byte_perm (w[32], w[33], selector); + w[53] = __byte_perm (w[31], w[32], selector); + w[52] = __byte_perm (w[30], w[31], selector); + w[51] = __byte_perm (w[29], w[30], selector); + w[50] = __byte_perm (w[28], w[29], selector); + w[49] = __byte_perm (w[27], w[28], selector); + w[48] = __byte_perm (w[26], w[27], selector); + w[47] = __byte_perm (w[25], w[26], selector); + w[46] = __byte_perm (w[24], w[25], selector); + w[45] = __byte_perm (w[23], w[24], selector); + w[44] = __byte_perm (w[22], w[23], selector); + w[43] = __byte_perm (w[21], w[22], selector); + w[42] = __byte_perm (w[20], w[21], selector); + w[41] = __byte_perm (w[19], w[20], selector); + w[40] = __byte_perm (w[18], w[19], selector); + w[39] = __byte_perm (w[17], w[18], selector); + w[38] = __byte_perm (w[16], w[17], selector); + w[37] = __byte_perm (w[15], w[16], selector); + w[36] = __byte_perm (w[14], w[15], selector); + w[35] = __byte_perm (w[13], w[14], selector); + w[34] = __byte_perm (w[12], w[13], selector); + w[33] = __byte_perm (w[11], w[12], selector); + w[32] = __byte_perm (w[10], w[11], selector); + w[31] = __byte_perm (w[ 9], w[10], selector); + w[30] = __byte_perm (w[ 8], w[ 9], selector); + w[29] = __byte_perm (w[ 7], w[ 8], selector); + w[28] = __byte_perm (w[ 6], w[ 7], selector); + w[27] = __byte_perm (w[ 5], w[ 6], selector); + w[26] = __byte_perm (w[ 4], w[ 5], 
selector); + w[25] = __byte_perm (w[ 3], w[ 4], selector); + w[24] = __byte_perm (w[ 2], w[ 3], selector); + w[23] = __byte_perm (w[ 1], w[ 2], selector); + w[22] = __byte_perm (w[ 0], w[ 1], selector); + w[21] = __byte_perm ( 0, w[ 0], selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = __byte_perm (w[40], w[41], selector); + w[62] = __byte_perm (w[39], w[40], selector); + w[61] = __byte_perm (w[38], w[39], selector); + w[60] = __byte_perm (w[37], w[38], selector); + w[59] = __byte_perm (w[36], w[37], selector); + w[58] = __byte_perm (w[35], w[36], selector); + w[57] = __byte_perm (w[34], w[35], selector); + w[56] = __byte_perm (w[33], w[34], selector); + w[55] = __byte_perm (w[32], w[33], selector); + w[54] = __byte_perm (w[31], w[32], selector); + w[53] = __byte_perm (w[30], w[31], selector); + w[52] = __byte_perm (w[29], w[30], selector); + w[51] = __byte_perm (w[28], w[29], selector); + w[50] = __byte_perm (w[27], w[28], selector); + w[49] = __byte_perm (w[26], w[27], selector); + w[48] = __byte_perm (w[25], w[26], selector); + w[47] = __byte_perm (w[24], w[25], selector); + w[46] = __byte_perm (w[23], w[24], selector); + w[45] = __byte_perm (w[22], w[23], selector); + w[44] = __byte_perm (w[21], w[22], selector); + w[43] = __byte_perm (w[20], w[21], selector); + w[42] = __byte_perm (w[19], w[20], selector); + w[41] = __byte_perm (w[18], w[19], selector); + w[40] = __byte_perm (w[17], w[18], selector); + w[39] = __byte_perm (w[16], w[17], selector); + w[38] = __byte_perm (w[15], w[16], selector); + w[37] = __byte_perm (w[14], w[15], selector); + w[36] = __byte_perm (w[13], w[14], selector); + w[35] = __byte_perm (w[12], w[13], selector); + w[34] = __byte_perm (w[11], w[12], selector); + w[33] = 
__byte_perm (w[10], w[11], selector); + w[32] = __byte_perm (w[ 9], w[10], selector); + w[31] = __byte_perm (w[ 8], w[ 9], selector); + w[30] = __byte_perm (w[ 7], w[ 8], selector); + w[29] = __byte_perm (w[ 6], w[ 7], selector); + w[28] = __byte_perm (w[ 5], w[ 6], selector); + w[27] = __byte_perm (w[ 4], w[ 5], selector); + w[26] = __byte_perm (w[ 3], w[ 4], selector); + w[25] = __byte_perm (w[ 2], w[ 3], selector); + w[24] = __byte_perm (w[ 1], w[ 2], selector); + w[23] = __byte_perm (w[ 0], w[ 1], selector); + w[22] = __byte_perm ( 0, w[ 0], selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = __byte_perm (w[39], w[40], selector); + w[62] = __byte_perm (w[38], w[39], selector); + w[61] = __byte_perm (w[37], w[38], selector); + w[60] = __byte_perm (w[36], w[37], selector); + w[59] = __byte_perm (w[35], w[36], selector); + w[58] = __byte_perm (w[34], w[35], selector); + w[57] = __byte_perm (w[33], w[34], selector); + w[56] = __byte_perm (w[32], w[33], selector); + w[55] = __byte_perm (w[31], w[32], selector); + w[54] = __byte_perm (w[30], w[31], selector); + w[53] = __byte_perm (w[29], w[30], selector); + w[52] = __byte_perm (w[28], w[29], selector); + w[51] = __byte_perm (w[27], w[28], selector); + w[50] = __byte_perm (w[26], w[27], selector); + w[49] = __byte_perm (w[25], w[26], selector); + w[48] = __byte_perm (w[24], w[25], selector); + w[47] = __byte_perm (w[23], w[24], selector); + w[46] = __byte_perm (w[22], w[23], selector); + w[45] = __byte_perm (w[21], w[22], selector); + w[44] = __byte_perm (w[20], w[21], selector); + w[43] = __byte_perm (w[19], w[20], selector); + w[42] = __byte_perm (w[18], w[19], selector); + w[41] = __byte_perm (w[17], w[18], selector); + w[40] = __byte_perm 
(w[16], w[17], selector); + w[39] = __byte_perm (w[15], w[16], selector); + w[38] = __byte_perm (w[14], w[15], selector); + w[37] = __byte_perm (w[13], w[14], selector); + w[36] = __byte_perm (w[12], w[13], selector); + w[35] = __byte_perm (w[11], w[12], selector); + w[34] = __byte_perm (w[10], w[11], selector); + w[33] = __byte_perm (w[ 9], w[10], selector); + w[32] = __byte_perm (w[ 8], w[ 9], selector); + w[31] = __byte_perm (w[ 7], w[ 8], selector); + w[30] = __byte_perm (w[ 6], w[ 7], selector); + w[29] = __byte_perm (w[ 5], w[ 6], selector); + w[28] = __byte_perm (w[ 4], w[ 5], selector); + w[27] = __byte_perm (w[ 3], w[ 4], selector); + w[26] = __byte_perm (w[ 2], w[ 3], selector); + w[25] = __byte_perm (w[ 1], w[ 2], selector); + w[24] = __byte_perm (w[ 0], w[ 1], selector); + w[23] = __byte_perm ( 0, w[ 0], selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = __byte_perm (w[38], w[39], selector); + w[62] = __byte_perm (w[37], w[38], selector); + w[61] = __byte_perm (w[36], w[37], selector); + w[60] = __byte_perm (w[35], w[36], selector); + w[59] = __byte_perm (w[34], w[35], selector); + w[58] = __byte_perm (w[33], w[34], selector); + w[57] = __byte_perm (w[32], w[33], selector); + w[56] = __byte_perm (w[31], w[32], selector); + w[55] = __byte_perm (w[30], w[31], selector); + w[54] = __byte_perm (w[29], w[30], selector); + w[53] = __byte_perm (w[28], w[29], selector); + w[52] = __byte_perm (w[27], w[28], selector); + w[51] = __byte_perm (w[26], w[27], selector); + w[50] = __byte_perm (w[25], w[26], selector); + w[49] = __byte_perm (w[24], w[25], selector); + w[48] = __byte_perm (w[23], w[24], selector); + w[47] = __byte_perm (w[22], w[23], selector); + w[46] = __byte_perm 
(w[21], w[22], selector); + w[45] = __byte_perm (w[20], w[21], selector); + w[44] = __byte_perm (w[19], w[20], selector); + w[43] = __byte_perm (w[18], w[19], selector); + w[42] = __byte_perm (w[17], w[18], selector); + w[41] = __byte_perm (w[16], w[17], selector); + w[40] = __byte_perm (w[15], w[16], selector); + w[39] = __byte_perm (w[14], w[15], selector); + w[38] = __byte_perm (w[13], w[14], selector); + w[37] = __byte_perm (w[12], w[13], selector); + w[36] = __byte_perm (w[11], w[12], selector); + w[35] = __byte_perm (w[10], w[11], selector); + w[34] = __byte_perm (w[ 9], w[10], selector); + w[33] = __byte_perm (w[ 8], w[ 9], selector); + w[32] = __byte_perm (w[ 7], w[ 8], selector); + w[31] = __byte_perm (w[ 6], w[ 7], selector); + w[30] = __byte_perm (w[ 5], w[ 6], selector); + w[29] = __byte_perm (w[ 4], w[ 5], selector); + w[28] = __byte_perm (w[ 3], w[ 4], selector); + w[27] = __byte_perm (w[ 2], w[ 3], selector); + w[26] = __byte_perm (w[ 1], w[ 2], selector); + w[25] = __byte_perm (w[ 0], w[ 1], selector); + w[24] = __byte_perm ( 0, w[ 0], selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = __byte_perm (w[37], w[38], selector); + w[62] = __byte_perm (w[36], w[37], selector); + w[61] = __byte_perm (w[35], w[36], selector); + w[60] = __byte_perm (w[34], w[35], selector); + w[59] = __byte_perm (w[33], w[34], selector); + w[58] = __byte_perm (w[32], w[33], selector); + w[57] = __byte_perm (w[31], w[32], selector); + w[56] = __byte_perm (w[30], w[31], selector); + w[55] = __byte_perm (w[29], w[30], selector); + w[54] = __byte_perm (w[28], w[29], selector); + w[53] = __byte_perm (w[27], w[28], selector); + w[52] = __byte_perm (w[26], w[27], selector); + w[51] 
= __byte_perm (w[25], w[26], selector); + w[50] = __byte_perm (w[24], w[25], selector); + w[49] = __byte_perm (w[23], w[24], selector); + w[48] = __byte_perm (w[22], w[23], selector); + w[47] = __byte_perm (w[21], w[22], selector); + w[46] = __byte_perm (w[20], w[21], selector); + w[45] = __byte_perm (w[19], w[20], selector); + w[44] = __byte_perm (w[18], w[19], selector); + w[43] = __byte_perm (w[17], w[18], selector); + w[42] = __byte_perm (w[16], w[17], selector); + w[41] = __byte_perm (w[15], w[16], selector); + w[40] = __byte_perm (w[14], w[15], selector); + w[39] = __byte_perm (w[13], w[14], selector); + w[38] = __byte_perm (w[12], w[13], selector); + w[37] = __byte_perm (w[11], w[12], selector); + w[36] = __byte_perm (w[10], w[11], selector); + w[35] = __byte_perm (w[ 9], w[10], selector); + w[34] = __byte_perm (w[ 8], w[ 9], selector); + w[33] = __byte_perm (w[ 7], w[ 8], selector); + w[32] = __byte_perm (w[ 6], w[ 7], selector); + w[31] = __byte_perm (w[ 5], w[ 6], selector); + w[30] = __byte_perm (w[ 4], w[ 5], selector); + w[29] = __byte_perm (w[ 3], w[ 4], selector); + w[28] = __byte_perm (w[ 2], w[ 3], selector); + w[27] = __byte_perm (w[ 1], w[ 2], selector); + w[26] = __byte_perm (w[ 0], w[ 1], selector); + w[25] = __byte_perm ( 0, w[ 0], selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = __byte_perm (w[36], w[37], selector); + w[62] = __byte_perm (w[35], w[36], selector); + w[61] = __byte_perm (w[34], w[35], selector); + w[60] = __byte_perm (w[33], w[34], selector); + w[59] = __byte_perm (w[32], w[33], selector); + w[58] = __byte_perm (w[31], w[32], selector); + w[57] = __byte_perm (w[30], w[31], selector); + w[56] = __byte_perm (w[29], 
w[30], selector); + w[55] = __byte_perm (w[28], w[29], selector); + w[54] = __byte_perm (w[27], w[28], selector); + w[53] = __byte_perm (w[26], w[27], selector); + w[52] = __byte_perm (w[25], w[26], selector); + w[51] = __byte_perm (w[24], w[25], selector); + w[50] = __byte_perm (w[23], w[24], selector); + w[49] = __byte_perm (w[22], w[23], selector); + w[48] = __byte_perm (w[21], w[22], selector); + w[47] = __byte_perm (w[20], w[21], selector); + w[46] = __byte_perm (w[19], w[20], selector); + w[45] = __byte_perm (w[18], w[19], selector); + w[44] = __byte_perm (w[17], w[18], selector); + w[43] = __byte_perm (w[16], w[17], selector); + w[42] = __byte_perm (w[15], w[16], selector); + w[41] = __byte_perm (w[14], w[15], selector); + w[40] = __byte_perm (w[13], w[14], selector); + w[39] = __byte_perm (w[12], w[13], selector); + w[38] = __byte_perm (w[11], w[12], selector); + w[37] = __byte_perm (w[10], w[11], selector); + w[36] = __byte_perm (w[ 9], w[10], selector); + w[35] = __byte_perm (w[ 8], w[ 9], selector); + w[34] = __byte_perm (w[ 7], w[ 8], selector); + w[33] = __byte_perm (w[ 6], w[ 7], selector); + w[32] = __byte_perm (w[ 5], w[ 6], selector); + w[31] = __byte_perm (w[ 4], w[ 5], selector); + w[30] = __byte_perm (w[ 3], w[ 4], selector); + w[29] = __byte_perm (w[ 2], w[ 3], selector); + w[28] = __byte_perm (w[ 1], w[ 2], selector); + w[27] = __byte_perm (w[ 0], w[ 1], selector); + w[26] = __byte_perm ( 0, w[ 0], selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = __byte_perm (w[35], w[36], selector); + w[62] = __byte_perm (w[34], w[35], selector); + w[61] = __byte_perm (w[33], w[34], selector); + w[60] = __byte_perm (w[32], w[33], 
selector); + w[59] = __byte_perm (w[31], w[32], selector); + w[58] = __byte_perm (w[30], w[31], selector); + w[57] = __byte_perm (w[29], w[30], selector); + w[56] = __byte_perm (w[28], w[29], selector); + w[55] = __byte_perm (w[27], w[28], selector); + w[54] = __byte_perm (w[26], w[27], selector); + w[53] = __byte_perm (w[25], w[26], selector); + w[52] = __byte_perm (w[24], w[25], selector); + w[51] = __byte_perm (w[23], w[24], selector); + w[50] = __byte_perm (w[22], w[23], selector); + w[49] = __byte_perm (w[21], w[22], selector); + w[48] = __byte_perm (w[20], w[21], selector); + w[47] = __byte_perm (w[19], w[20], selector); + w[46] = __byte_perm (w[18], w[19], selector); + w[45] = __byte_perm (w[17], w[18], selector); + w[44] = __byte_perm (w[16], w[17], selector); + w[43] = __byte_perm (w[15], w[16], selector); + w[42] = __byte_perm (w[14], w[15], selector); + w[41] = __byte_perm (w[13], w[14], selector); + w[40] = __byte_perm (w[12], w[13], selector); + w[39] = __byte_perm (w[11], w[12], selector); + w[38] = __byte_perm (w[10], w[11], selector); + w[37] = __byte_perm (w[ 9], w[10], selector); + w[36] = __byte_perm (w[ 8], w[ 9], selector); + w[35] = __byte_perm (w[ 7], w[ 8], selector); + w[34] = __byte_perm (w[ 6], w[ 7], selector); + w[33] = __byte_perm (w[ 5], w[ 6], selector); + w[32] = __byte_perm (w[ 4], w[ 5], selector); + w[31] = __byte_perm (w[ 3], w[ 4], selector); + w[30] = __byte_perm (w[ 2], w[ 3], selector); + w[29] = __byte_perm (w[ 1], w[ 2], selector); + w[28] = __byte_perm (w[ 0], w[ 1], selector); + w[27] = __byte_perm ( 0, w[ 0], selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = __byte_perm (w[34], 
w[35], selector); + w[62] = __byte_perm (w[33], w[34], selector); + w[61] = __byte_perm (w[32], w[33], selector); + w[60] = __byte_perm (w[31], w[32], selector); + w[59] = __byte_perm (w[30], w[31], selector); + w[58] = __byte_perm (w[29], w[30], selector); + w[57] = __byte_perm (w[28], w[29], selector); + w[56] = __byte_perm (w[27], w[28], selector); + w[55] = __byte_perm (w[26], w[27], selector); + w[54] = __byte_perm (w[25], w[26], selector); + w[53] = __byte_perm (w[24], w[25], selector); + w[52] = __byte_perm (w[23], w[24], selector); + w[51] = __byte_perm (w[22], w[23], selector); + w[50] = __byte_perm (w[21], w[22], selector); + w[49] = __byte_perm (w[20], w[21], selector); + w[48] = __byte_perm (w[19], w[20], selector); + w[47] = __byte_perm (w[18], w[19], selector); + w[46] = __byte_perm (w[17], w[18], selector); + w[45] = __byte_perm (w[16], w[17], selector); + w[44] = __byte_perm (w[15], w[16], selector); + w[43] = __byte_perm (w[14], w[15], selector); + w[42] = __byte_perm (w[13], w[14], selector); + w[41] = __byte_perm (w[12], w[13], selector); + w[40] = __byte_perm (w[11], w[12], selector); + w[39] = __byte_perm (w[10], w[11], selector); + w[38] = __byte_perm (w[ 9], w[10], selector); + w[37] = __byte_perm (w[ 8], w[ 9], selector); + w[36] = __byte_perm (w[ 7], w[ 8], selector); + w[35] = __byte_perm (w[ 6], w[ 7], selector); + w[34] = __byte_perm (w[ 5], w[ 6], selector); + w[33] = __byte_perm (w[ 4], w[ 5], selector); + w[32] = __byte_perm (w[ 3], w[ 4], selector); + w[31] = __byte_perm (w[ 2], w[ 3], selector); + w[30] = __byte_perm (w[ 1], w[ 2], selector); + w[29] = __byte_perm (w[ 0], w[ 1], selector); + w[28] = __byte_perm ( 0, w[ 0], selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 
4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = __byte_perm (w[33], w[34], selector); + w[62] = __byte_perm (w[32], w[33], selector); + w[61] = __byte_perm (w[31], w[32], selector); + w[60] = __byte_perm (w[30], w[31], selector); + w[59] = __byte_perm (w[29], w[30], selector); + w[58] = __byte_perm (w[28], w[29], selector); + w[57] = __byte_perm (w[27], w[28], selector); + w[56] = __byte_perm (w[26], w[27], selector); + w[55] = __byte_perm (w[25], w[26], selector); + w[54] = __byte_perm (w[24], w[25], selector); + w[53] = __byte_perm (w[23], w[24], selector); + w[52] = __byte_perm (w[22], w[23], selector); + w[51] = __byte_perm (w[21], w[22], selector); + w[50] = __byte_perm (w[20], w[21], selector); + w[49] = __byte_perm (w[19], w[20], selector); + w[48] = __byte_perm (w[18], w[19], selector); + w[47] = __byte_perm (w[17], w[18], selector); + w[46] = __byte_perm (w[16], w[17], selector); + w[45] = __byte_perm (w[15], w[16], selector); + w[44] = __byte_perm (w[14], w[15], selector); + w[43] = __byte_perm (w[13], w[14], selector); + w[42] = __byte_perm (w[12], w[13], selector); + w[41] = __byte_perm (w[11], w[12], selector); + w[40] = __byte_perm (w[10], w[11], selector); + w[39] = __byte_perm (w[ 9], w[10], selector); + w[38] = __byte_perm (w[ 8], w[ 9], selector); + w[37] = __byte_perm (w[ 7], w[ 8], selector); + w[36] = __byte_perm (w[ 6], w[ 7], selector); + w[35] = __byte_perm (w[ 5], w[ 6], selector); + w[34] = __byte_perm (w[ 4], w[ 5], selector); + w[33] = __byte_perm (w[ 3], w[ 4], selector); + w[32] = __byte_perm (w[ 2], w[ 3], selector); + w[31] = __byte_perm (w[ 1], w[ 2], selector); + w[30] = __byte_perm (w[ 0], w[ 1], selector); + w[29] = __byte_perm ( 0, w[ 0], selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + 
w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = __byte_perm (w[32], w[33], selector); + w[62] = __byte_perm (w[31], w[32], selector); + w[61] = __byte_perm (w[30], w[31], selector); + w[60] = __byte_perm (w[29], w[30], selector); + w[59] = __byte_perm (w[28], w[29], selector); + w[58] = __byte_perm (w[27], w[28], selector); + w[57] = __byte_perm (w[26], w[27], selector); + w[56] = __byte_perm (w[25], w[26], selector); + w[55] = __byte_perm (w[24], w[25], selector); + w[54] = __byte_perm (w[23], w[24], selector); + w[53] = __byte_perm (w[22], w[23], selector); + w[52] = __byte_perm (w[21], w[22], selector); + w[51] = __byte_perm (w[20], w[21], selector); + w[50] = __byte_perm (w[19], w[20], selector); + w[49] = __byte_perm (w[18], w[19], selector); + w[48] = __byte_perm (w[17], w[18], selector); + w[47] = __byte_perm (w[16], w[17], selector); + w[46] = __byte_perm (w[15], w[16], selector); + w[45] = __byte_perm (w[14], w[15], selector); + w[44] = __byte_perm (w[13], w[14], selector); + w[43] = __byte_perm (w[12], w[13], selector); + w[42] = __byte_perm (w[11], w[12], selector); + w[41] = __byte_perm (w[10], w[11], selector); + w[40] = __byte_perm (w[ 9], w[10], selector); + w[39] = __byte_perm (w[ 8], w[ 9], selector); + w[38] = __byte_perm (w[ 7], w[ 8], selector); + w[37] = __byte_perm (w[ 6], w[ 7], selector); + w[36] = __byte_perm (w[ 5], w[ 6], selector); + w[35] = __byte_perm (w[ 4], w[ 5], selector); + w[34] = __byte_perm (w[ 3], w[ 4], selector); + w[33] = __byte_perm (w[ 2], w[ 3], selector); + w[32] = __byte_perm (w[ 1], w[ 2], selector); + w[31] = __byte_perm (w[ 0], w[ 1], selector); + w[30] = __byte_perm ( 0, w[ 0], selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 
0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = __byte_perm (w[31], w[32], selector); + w[62] = __byte_perm (w[30], w[31], selector); + w[61] = __byte_perm (w[29], w[30], selector); + w[60] = __byte_perm (w[28], w[29], selector); + w[59] = __byte_perm (w[27], w[28], selector); + w[58] = __byte_perm (w[26], w[27], selector); + w[57] = __byte_perm (w[25], w[26], selector); + w[56] = __byte_perm (w[24], w[25], selector); + w[55] = __byte_perm (w[23], w[24], selector); + w[54] = __byte_perm (w[22], w[23], selector); + w[53] = __byte_perm (w[21], w[22], selector); + w[52] = __byte_perm (w[20], w[21], selector); + w[51] = __byte_perm (w[19], w[20], selector); + w[50] = __byte_perm (w[18], w[19], selector); + w[49] = __byte_perm (w[17], w[18], selector); + w[48] = __byte_perm (w[16], w[17], selector); + w[47] = __byte_perm (w[15], w[16], selector); + w[46] = __byte_perm (w[14], w[15], selector); + w[45] = __byte_perm (w[13], w[14], selector); + w[44] = __byte_perm (w[12], w[13], selector); + w[43] = __byte_perm (w[11], w[12], selector); + w[42] = __byte_perm (w[10], w[11], selector); + w[41] = __byte_perm (w[ 9], w[10], selector); + w[40] = __byte_perm (w[ 8], w[ 9], selector); + w[39] = __byte_perm (w[ 7], w[ 8], selector); + w[38] = __byte_perm (w[ 6], w[ 7], selector); + w[37] = __byte_perm (w[ 5], w[ 6], selector); + w[36] = __byte_perm (w[ 4], w[ 5], selector); + w[35] = __byte_perm (w[ 3], w[ 4], selector); + w[34] = __byte_perm (w[ 2], w[ 3], selector); + w[33] = __byte_perm (w[ 1], w[ 2], selector); + w[32] = __byte_perm (w[ 0], w[ 1], selector); + w[31] = __byte_perm ( 0, w[ 0], selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + 
w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = __byte_perm (w[30], w[31], selector); + w[62] = __byte_perm (w[29], w[30], selector); + w[61] = __byte_perm (w[28], w[29], selector); + w[60] = __byte_perm (w[27], w[28], selector); + w[59] = __byte_perm (w[26], w[27], selector); + w[58] = __byte_perm (w[25], w[26], selector); + w[57] = __byte_perm (w[24], w[25], selector); + w[56] = __byte_perm (w[23], w[24], selector); + w[55] = __byte_perm (w[22], w[23], selector); + w[54] = __byte_perm (w[21], w[22], selector); + w[53] = __byte_perm (w[20], w[21], selector); + w[52] = __byte_perm (w[19], w[20], selector); + w[51] = __byte_perm (w[18], w[19], selector); + w[50] = __byte_perm (w[17], w[18], selector); + w[49] = __byte_perm (w[16], w[17], selector); + w[48] = __byte_perm (w[15], w[16], selector); + w[47] = __byte_perm (w[14], w[15], selector); + w[46] = __byte_perm (w[13], w[14], selector); + w[45] = __byte_perm (w[12], w[13], selector); + w[44] = __byte_perm (w[11], w[12], selector); + w[43] = __byte_perm (w[10], w[11], selector); + w[42] = __byte_perm (w[ 9], w[10], selector); + w[41] = __byte_perm (w[ 8], w[ 9], selector); + w[40] = __byte_perm (w[ 7], w[ 8], selector); + w[39] = __byte_perm (w[ 6], w[ 7], selector); + w[38] = __byte_perm (w[ 5], w[ 6], selector); + w[37] = __byte_perm (w[ 4], w[ 5], selector); + w[36] = __byte_perm (w[ 3], w[ 4], selector); + w[35] = __byte_perm (w[ 2], w[ 3], selector); + w[34] = __byte_perm (w[ 1], w[ 2], selector); + w[33] = __byte_perm (w[ 0], w[ 1], selector); + w[32] = __byte_perm ( 0, w[ 0], selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + 
w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = __byte_perm (w[29], w[30], selector); + w[62] = __byte_perm (w[28], w[29], selector); + w[61] = __byte_perm (w[27], w[28], selector); + w[60] = __byte_perm (w[26], w[27], selector); + w[59] = __byte_perm (w[25], w[26], selector); + w[58] = __byte_perm (w[24], w[25], selector); + w[57] = __byte_perm (w[23], w[24], selector); + w[56] = __byte_perm (w[22], w[23], selector); + w[55] = __byte_perm (w[21], w[22], selector); + w[54] = __byte_perm (w[20], w[21], selector); + w[53] = __byte_perm (w[19], w[20], selector); + w[52] = __byte_perm (w[18], w[19], selector); + w[51] = __byte_perm (w[17], w[18], selector); + w[50] = __byte_perm (w[16], w[17], selector); + w[49] = __byte_perm (w[15], w[16], selector); + w[48] = __byte_perm (w[14], w[15], selector); + w[47] = __byte_perm (w[13], w[14], selector); + w[46] = __byte_perm (w[12], w[13], selector); + w[45] = __byte_perm (w[11], w[12], selector); + w[44] = __byte_perm (w[10], w[11], selector); + w[43] = __byte_perm (w[ 9], w[10], selector); + w[42] = __byte_perm (w[ 8], w[ 9], selector); + w[41] = __byte_perm (w[ 7], w[ 8], selector); + w[40] = __byte_perm (w[ 6], w[ 7], selector); + w[39] = __byte_perm (w[ 5], w[ 6], selector); + w[38] = __byte_perm (w[ 4], w[ 5], selector); + w[37] = __byte_perm (w[ 3], w[ 4], selector); + w[36] = __byte_perm (w[ 2], w[ 3], selector); + w[35] = __byte_perm (w[ 1], w[ 2], selector); + w[34] = __byte_perm (w[ 0], w[ 1], selector); + w[33] = __byte_perm ( 0, w[ 0], selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 
0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = __byte_perm (w[28], w[29], selector); + w[62] = __byte_perm (w[27], w[28], selector); + w[61] = __byte_perm (w[26], w[27], selector); + w[60] = __byte_perm (w[25], w[26], selector); + w[59] = __byte_perm (w[24], w[25], selector); + w[58] = __byte_perm (w[23], w[24], selector); + w[57] = __byte_perm (w[22], w[23], selector); + w[56] = __byte_perm (w[21], w[22], selector); + w[55] = __byte_perm (w[20], w[21], selector); + w[54] = __byte_perm (w[19], w[20], selector); + w[53] = __byte_perm (w[18], w[19], selector); + w[52] = __byte_perm (w[17], w[18], selector); + w[51] = __byte_perm (w[16], w[17], selector); + w[50] = __byte_perm (w[15], w[16], selector); + w[49] = __byte_perm (w[14], w[15], selector); + w[48] = __byte_perm (w[13], w[14], selector); + w[47] = __byte_perm (w[12], w[13], selector); + w[46] = __byte_perm (w[11], w[12], selector); + w[45] = __byte_perm (w[10], w[11], selector); + w[44] = __byte_perm (w[ 9], w[10], selector); + w[43] = __byte_perm (w[ 8], w[ 9], selector); + w[42] = __byte_perm (w[ 7], w[ 8], selector); + w[41] = __byte_perm (w[ 6], w[ 7], selector); + w[40] = __byte_perm (w[ 5], w[ 6], selector); + w[39] = __byte_perm (w[ 4], w[ 5], selector); + w[38] = __byte_perm (w[ 3], w[ 4], selector); + w[37] = __byte_perm (w[ 2], w[ 3], selector); + w[36] = __byte_perm (w[ 1], w[ 2], selector); + w[35] = __byte_perm (w[ 0], w[ 1], selector); + w[34] = __byte_perm ( 0, w[ 0], selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + 
w[ 0] = 0; + + break; + + case 35: + w[63] = __byte_perm (w[27], w[28], selector); + w[62] = __byte_perm (w[26], w[27], selector); + w[61] = __byte_perm (w[25], w[26], selector); + w[60] = __byte_perm (w[24], w[25], selector); + w[59] = __byte_perm (w[23], w[24], selector); + w[58] = __byte_perm (w[22], w[23], selector); + w[57] = __byte_perm (w[21], w[22], selector); + w[56] = __byte_perm (w[20], w[21], selector); + w[55] = __byte_perm (w[19], w[20], selector); + w[54] = __byte_perm (w[18], w[19], selector); + w[53] = __byte_perm (w[17], w[18], selector); + w[52] = __byte_perm (w[16], w[17], selector); + w[51] = __byte_perm (w[15], w[16], selector); + w[50] = __byte_perm (w[14], w[15], selector); + w[49] = __byte_perm (w[13], w[14], selector); + w[48] = __byte_perm (w[12], w[13], selector); + w[47] = __byte_perm (w[11], w[12], selector); + w[46] = __byte_perm (w[10], w[11], selector); + w[45] = __byte_perm (w[ 9], w[10], selector); + w[44] = __byte_perm (w[ 8], w[ 9], selector); + w[43] = __byte_perm (w[ 7], w[ 8], selector); + w[42] = __byte_perm (w[ 6], w[ 7], selector); + w[41] = __byte_perm (w[ 5], w[ 6], selector); + w[40] = __byte_perm (w[ 4], w[ 5], selector); + w[39] = __byte_perm (w[ 3], w[ 4], selector); + w[38] = __byte_perm (w[ 2], w[ 3], selector); + w[37] = __byte_perm (w[ 1], w[ 2], selector); + w[36] = __byte_perm (w[ 0], w[ 1], selector); + w[35] = __byte_perm ( 0, w[ 0], selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = __byte_perm (w[26], w[27], selector); + w[62] = __byte_perm (w[25], w[26], selector); 
+ w[61] = __byte_perm (w[24], w[25], selector); + w[60] = __byte_perm (w[23], w[24], selector); + w[59] = __byte_perm (w[22], w[23], selector); + w[58] = __byte_perm (w[21], w[22], selector); + w[57] = __byte_perm (w[20], w[21], selector); + w[56] = __byte_perm (w[19], w[20], selector); + w[55] = __byte_perm (w[18], w[19], selector); + w[54] = __byte_perm (w[17], w[18], selector); + w[53] = __byte_perm (w[16], w[17], selector); + w[52] = __byte_perm (w[15], w[16], selector); + w[51] = __byte_perm (w[14], w[15], selector); + w[50] = __byte_perm (w[13], w[14], selector); + w[49] = __byte_perm (w[12], w[13], selector); + w[48] = __byte_perm (w[11], w[12], selector); + w[47] = __byte_perm (w[10], w[11], selector); + w[46] = __byte_perm (w[ 9], w[10], selector); + w[45] = __byte_perm (w[ 8], w[ 9], selector); + w[44] = __byte_perm (w[ 7], w[ 8], selector); + w[43] = __byte_perm (w[ 6], w[ 7], selector); + w[42] = __byte_perm (w[ 5], w[ 6], selector); + w[41] = __byte_perm (w[ 4], w[ 5], selector); + w[40] = __byte_perm (w[ 3], w[ 4], selector); + w[39] = __byte_perm (w[ 2], w[ 3], selector); + w[38] = __byte_perm (w[ 1], w[ 2], selector); + w[37] = __byte_perm (w[ 0], w[ 1], selector); + w[36] = __byte_perm ( 0, w[ 0], selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = __byte_perm (w[25], w[26], selector); + w[62] = __byte_perm (w[24], w[25], selector); + w[61] = __byte_perm (w[23], w[24], selector); + w[60] = __byte_perm (w[22], w[23], selector); + w[59] = __byte_perm (w[21], w[22], selector); + w[58] = __byte_perm 
(w[20], w[21], selector); + w[57] = __byte_perm (w[19], w[20], selector); + w[56] = __byte_perm (w[18], w[19], selector); + w[55] = __byte_perm (w[17], w[18], selector); + w[54] = __byte_perm (w[16], w[17], selector); + w[53] = __byte_perm (w[15], w[16], selector); + w[52] = __byte_perm (w[14], w[15], selector); + w[51] = __byte_perm (w[13], w[14], selector); + w[50] = __byte_perm (w[12], w[13], selector); + w[49] = __byte_perm (w[11], w[12], selector); + w[48] = __byte_perm (w[10], w[11], selector); + w[47] = __byte_perm (w[ 9], w[10], selector); + w[46] = __byte_perm (w[ 8], w[ 9], selector); + w[45] = __byte_perm (w[ 7], w[ 8], selector); + w[44] = __byte_perm (w[ 6], w[ 7], selector); + w[43] = __byte_perm (w[ 5], w[ 6], selector); + w[42] = __byte_perm (w[ 4], w[ 5], selector); + w[41] = __byte_perm (w[ 3], w[ 4], selector); + w[40] = __byte_perm (w[ 2], w[ 3], selector); + w[39] = __byte_perm (w[ 1], w[ 2], selector); + w[38] = __byte_perm (w[ 0], w[ 1], selector); + w[37] = __byte_perm ( 0, w[ 0], selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = __byte_perm (w[24], w[25], selector); + w[62] = __byte_perm (w[23], w[24], selector); + w[61] = __byte_perm (w[22], w[23], selector); + w[60] = __byte_perm (w[21], w[22], selector); + w[59] = __byte_perm (w[20], w[21], selector); + w[58] = __byte_perm (w[19], w[20], selector); + w[57] = __byte_perm (w[18], w[19], selector); + w[56] = __byte_perm (w[17], w[18], selector); + w[55] = __byte_perm (w[16], w[17], selector); + w[54] = __byte_perm (w[15], 
w[16], selector); + w[53] = __byte_perm (w[14], w[15], selector); + w[52] = __byte_perm (w[13], w[14], selector); + w[51] = __byte_perm (w[12], w[13], selector); + w[50] = __byte_perm (w[11], w[12], selector); + w[49] = __byte_perm (w[10], w[11], selector); + w[48] = __byte_perm (w[ 9], w[10], selector); + w[47] = __byte_perm (w[ 8], w[ 9], selector); + w[46] = __byte_perm (w[ 7], w[ 8], selector); + w[45] = __byte_perm (w[ 6], w[ 7], selector); + w[44] = __byte_perm (w[ 5], w[ 6], selector); + w[43] = __byte_perm (w[ 4], w[ 5], selector); + w[42] = __byte_perm (w[ 3], w[ 4], selector); + w[41] = __byte_perm (w[ 2], w[ 3], selector); + w[40] = __byte_perm (w[ 1], w[ 2], selector); + w[39] = __byte_perm (w[ 0], w[ 1], selector); + w[38] = __byte_perm ( 0, w[ 0], selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = __byte_perm (w[23], w[24], selector); + w[62] = __byte_perm (w[22], w[23], selector); + w[61] = __byte_perm (w[21], w[22], selector); + w[60] = __byte_perm (w[20], w[21], selector); + w[59] = __byte_perm (w[19], w[20], selector); + w[58] = __byte_perm (w[18], w[19], selector); + w[57] = __byte_perm (w[17], w[18], selector); + w[56] = __byte_perm (w[16], w[17], selector); + w[55] = __byte_perm (w[15], w[16], selector); + w[54] = __byte_perm (w[14], w[15], selector); + w[53] = __byte_perm (w[13], w[14], selector); + w[52] = __byte_perm (w[12], w[13], selector); + w[51] = __byte_perm (w[11], w[12], selector); + w[50] = __byte_perm (w[10], w[11], selector); + w[49] = __byte_perm (w[ 
9], w[10], selector); + w[48] = __byte_perm (w[ 8], w[ 9], selector); + w[47] = __byte_perm (w[ 7], w[ 8], selector); + w[46] = __byte_perm (w[ 6], w[ 7], selector); + w[45] = __byte_perm (w[ 5], w[ 6], selector); + w[44] = __byte_perm (w[ 4], w[ 5], selector); + w[43] = __byte_perm (w[ 3], w[ 4], selector); + w[42] = __byte_perm (w[ 2], w[ 3], selector); + w[41] = __byte_perm (w[ 1], w[ 2], selector); + w[40] = __byte_perm (w[ 0], w[ 1], selector); + w[39] = __byte_perm ( 0, w[ 0], selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = __byte_perm (w[22], w[23], selector); + w[62] = __byte_perm (w[21], w[22], selector); + w[61] = __byte_perm (w[20], w[21], selector); + w[60] = __byte_perm (w[19], w[20], selector); + w[59] = __byte_perm (w[18], w[19], selector); + w[58] = __byte_perm (w[17], w[18], selector); + w[57] = __byte_perm (w[16], w[17], selector); + w[56] = __byte_perm (w[15], w[16], selector); + w[55] = __byte_perm (w[14], w[15], selector); + w[54] = __byte_perm (w[13], w[14], selector); + w[53] = __byte_perm (w[12], w[13], selector); + w[52] = __byte_perm (w[11], w[12], selector); + w[51] = __byte_perm (w[10], w[11], selector); + w[50] = __byte_perm (w[ 9], w[10], selector); + w[49] = __byte_perm (w[ 8], w[ 9], selector); + w[48] = __byte_perm (w[ 7], w[ 8], selector); + w[47] = __byte_perm (w[ 6], w[ 7], selector); + w[46] = __byte_perm (w[ 5], w[ 6], selector); + w[45] = __byte_perm (w[ 4], w[ 5], selector); + w[44] = __byte_perm (w[ 3], w[ 4], selector); + w[43] = 
__byte_perm (w[ 2], w[ 3], selector); + w[42] = __byte_perm (w[ 1], w[ 2], selector); + w[41] = __byte_perm (w[ 0], w[ 1], selector); + w[40] = __byte_perm ( 0, w[ 0], selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = __byte_perm (w[21], w[22], selector); + w[62] = __byte_perm (w[20], w[21], selector); + w[61] = __byte_perm (w[19], w[20], selector); + w[60] = __byte_perm (w[18], w[19], selector); + w[59] = __byte_perm (w[17], w[18], selector); + w[58] = __byte_perm (w[16], w[17], selector); + w[57] = __byte_perm (w[15], w[16], selector); + w[56] = __byte_perm (w[14], w[15], selector); + w[55] = __byte_perm (w[13], w[14], selector); + w[54] = __byte_perm (w[12], w[13], selector); + w[53] = __byte_perm (w[11], w[12], selector); + w[52] = __byte_perm (w[10], w[11], selector); + w[51] = __byte_perm (w[ 9], w[10], selector); + w[50] = __byte_perm (w[ 8], w[ 9], selector); + w[49] = __byte_perm (w[ 7], w[ 8], selector); + w[48] = __byte_perm (w[ 6], w[ 7], selector); + w[47] = __byte_perm (w[ 5], w[ 6], selector); + w[46] = __byte_perm (w[ 4], w[ 5], selector); + w[45] = __byte_perm (w[ 3], w[ 4], selector); + w[44] = __byte_perm (w[ 2], w[ 3], selector); + w[43] = __byte_perm (w[ 1], w[ 2], selector); + w[42] = __byte_perm (w[ 0], w[ 1], selector); + w[41] = __byte_perm ( 0, w[ 0], selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + 
w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = __byte_perm (w[20], w[21], selector); + w[62] = __byte_perm (w[19], w[20], selector); + w[61] = __byte_perm (w[18], w[19], selector); + w[60] = __byte_perm (w[17], w[18], selector); + w[59] = __byte_perm (w[16], w[17], selector); + w[58] = __byte_perm (w[15], w[16], selector); + w[57] = __byte_perm (w[14], w[15], selector); + w[56] = __byte_perm (w[13], w[14], selector); + w[55] = __byte_perm (w[12], w[13], selector); + w[54] = __byte_perm (w[11], w[12], selector); + w[53] = __byte_perm (w[10], w[11], selector); + w[52] = __byte_perm (w[ 9], w[10], selector); + w[51] = __byte_perm (w[ 8], w[ 9], selector); + w[50] = __byte_perm (w[ 7], w[ 8], selector); + w[49] = __byte_perm (w[ 6], w[ 7], selector); + w[48] = __byte_perm (w[ 5], w[ 6], selector); + w[47] = __byte_perm (w[ 4], w[ 5], selector); + w[46] = __byte_perm (w[ 3], w[ 4], selector); + w[45] = __byte_perm (w[ 2], w[ 3], selector); + w[44] = __byte_perm (w[ 1], w[ 2], selector); + w[43] = __byte_perm (w[ 0], w[ 1], selector); + w[42] = __byte_perm ( 0, w[ 0], selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + 
case 43: + w[63] = __byte_perm (w[19], w[20], selector); + w[62] = __byte_perm (w[18], w[19], selector); + w[61] = __byte_perm (w[17], w[18], selector); + w[60] = __byte_perm (w[16], w[17], selector); + w[59] = __byte_perm (w[15], w[16], selector); + w[58] = __byte_perm (w[14], w[15], selector); + w[57] = __byte_perm (w[13], w[14], selector); + w[56] = __byte_perm (w[12], w[13], selector); + w[55] = __byte_perm (w[11], w[12], selector); + w[54] = __byte_perm (w[10], w[11], selector); + w[53] = __byte_perm (w[ 9], w[10], selector); + w[52] = __byte_perm (w[ 8], w[ 9], selector); + w[51] = __byte_perm (w[ 7], w[ 8], selector); + w[50] = __byte_perm (w[ 6], w[ 7], selector); + w[49] = __byte_perm (w[ 5], w[ 6], selector); + w[48] = __byte_perm (w[ 4], w[ 5], selector); + w[47] = __byte_perm (w[ 3], w[ 4], selector); + w[46] = __byte_perm (w[ 2], w[ 3], selector); + w[45] = __byte_perm (w[ 1], w[ 2], selector); + w[44] = __byte_perm (w[ 0], w[ 1], selector); + w[43] = __byte_perm ( 0, w[ 0], selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = __byte_perm (w[18], w[19], selector); + w[62] = __byte_perm (w[17], w[18], selector); + w[61] = __byte_perm (w[16], w[17], selector); + w[60] = __byte_perm (w[15], w[16], selector); + w[59] = __byte_perm (w[14], w[15], selector); + w[58] = __byte_perm (w[13], w[14], selector); + w[57] = __byte_perm (w[12], w[13], selector); + w[56] = __byte_perm (w[11], w[12], selector); + w[55] = 
__byte_perm (w[10], w[11], selector); + w[54] = __byte_perm (w[ 9], w[10], selector); + w[53] = __byte_perm (w[ 8], w[ 9], selector); + w[52] = __byte_perm (w[ 7], w[ 8], selector); + w[51] = __byte_perm (w[ 6], w[ 7], selector); + w[50] = __byte_perm (w[ 5], w[ 6], selector); + w[49] = __byte_perm (w[ 4], w[ 5], selector); + w[48] = __byte_perm (w[ 3], w[ 4], selector); + w[47] = __byte_perm (w[ 2], w[ 3], selector); + w[46] = __byte_perm (w[ 1], w[ 2], selector); + w[45] = __byte_perm (w[ 0], w[ 1], selector); + w[44] = __byte_perm ( 0, w[ 0], selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = __byte_perm (w[17], w[18], selector); + w[62] = __byte_perm (w[16], w[17], selector); + w[61] = __byte_perm (w[15], w[16], selector); + w[60] = __byte_perm (w[14], w[15], selector); + w[59] = __byte_perm (w[13], w[14], selector); + w[58] = __byte_perm (w[12], w[13], selector); + w[57] = __byte_perm (w[11], w[12], selector); + w[56] = __byte_perm (w[10], w[11], selector); + w[55] = __byte_perm (w[ 9], w[10], selector); + w[54] = __byte_perm (w[ 8], w[ 9], selector); + w[53] = __byte_perm (w[ 7], w[ 8], selector); + w[52] = __byte_perm (w[ 6], w[ 7], selector); + w[51] = __byte_perm (w[ 5], w[ 6], selector); + w[50] = __byte_perm (w[ 4], w[ 5], selector); + w[49] = __byte_perm (w[ 3], w[ 4], selector); + w[48] = __byte_perm (w[ 2], w[ 3], selector); + w[47] = __byte_perm (w[ 1], w[ 2], selector); + w[46] = __byte_perm 
(w[ 0], w[ 1], selector); + w[45] = __byte_perm ( 0, w[ 0], selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = __byte_perm (w[16], w[17], selector); + w[62] = __byte_perm (w[15], w[16], selector); + w[61] = __byte_perm (w[14], w[15], selector); + w[60] = __byte_perm (w[13], w[14], selector); + w[59] = __byte_perm (w[12], w[13], selector); + w[58] = __byte_perm (w[11], w[12], selector); + w[57] = __byte_perm (w[10], w[11], selector); + w[56] = __byte_perm (w[ 9], w[10], selector); + w[55] = __byte_perm (w[ 8], w[ 9], selector); + w[54] = __byte_perm (w[ 7], w[ 8], selector); + w[53] = __byte_perm (w[ 6], w[ 7], selector); + w[52] = __byte_perm (w[ 5], w[ 6], selector); + w[51] = __byte_perm (w[ 4], w[ 5], selector); + w[50] = __byte_perm (w[ 3], w[ 4], selector); + w[49] = __byte_perm (w[ 2], w[ 3], selector); + w[48] = __byte_perm (w[ 1], w[ 2], selector); + w[47] = __byte_perm (w[ 0], w[ 1], selector); + w[46] = __byte_perm ( 0, w[ 0], selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + 
w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = __byte_perm (w[15], w[16], selector); + w[62] = __byte_perm (w[14], w[15], selector); + w[61] = __byte_perm (w[13], w[14], selector); + w[60] = __byte_perm (w[12], w[13], selector); + w[59] = __byte_perm (w[11], w[12], selector); + w[58] = __byte_perm (w[10], w[11], selector); + w[57] = __byte_perm (w[ 9], w[10], selector); + w[56] = __byte_perm (w[ 8], w[ 9], selector); + w[55] = __byte_perm (w[ 7], w[ 8], selector); + w[54] = __byte_perm (w[ 6], w[ 7], selector); + w[53] = __byte_perm (w[ 5], w[ 6], selector); + w[52] = __byte_perm (w[ 4], w[ 5], selector); + w[51] = __byte_perm (w[ 3], w[ 4], selector); + w[50] = __byte_perm (w[ 2], w[ 3], selector); + w[49] = __byte_perm (w[ 1], w[ 2], selector); + w[48] = __byte_perm (w[ 0], w[ 1], selector); + w[47] = __byte_perm ( 0, w[ 0], selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = __byte_perm (w[14], w[15], selector); + w[62] = __byte_perm (w[13], w[14], selector); + w[61] = __byte_perm (w[12], w[13], selector); + w[60] = __byte_perm (w[11], w[12], selector); + w[59] = __byte_perm (w[10], w[11], selector); + w[58] = __byte_perm (w[ 9], w[10], selector); + w[57] = __byte_perm (w[ 8], w[ 9], selector); + w[56] = __byte_perm (w[ 7], w[ 8], selector); + 
w[55] = __byte_perm (w[ 6], w[ 7], selector); + w[54] = __byte_perm (w[ 5], w[ 6], selector); + w[53] = __byte_perm (w[ 4], w[ 5], selector); + w[52] = __byte_perm (w[ 3], w[ 4], selector); + w[51] = __byte_perm (w[ 2], w[ 3], selector); + w[50] = __byte_perm (w[ 1], w[ 2], selector); + w[49] = __byte_perm (w[ 0], w[ 1], selector); + w[48] = __byte_perm ( 0, w[ 0], selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = __byte_perm (w[13], w[14], selector); + w[62] = __byte_perm (w[12], w[13], selector); + w[61] = __byte_perm (w[11], w[12], selector); + w[60] = __byte_perm (w[10], w[11], selector); + w[59] = __byte_perm (w[ 9], w[10], selector); + w[58] = __byte_perm (w[ 8], w[ 9], selector); + w[57] = __byte_perm (w[ 7], w[ 8], selector); + w[56] = __byte_perm (w[ 6], w[ 7], selector); + w[55] = __byte_perm (w[ 5], w[ 6], selector); + w[54] = __byte_perm (w[ 4], w[ 5], selector); + w[53] = __byte_perm (w[ 3], w[ 4], selector); + w[52] = __byte_perm (w[ 2], w[ 3], selector); + w[51] = __byte_perm (w[ 1], w[ 2], selector); + w[50] = __byte_perm (w[ 0], w[ 1], selector); + w[49] = __byte_perm ( 0, w[ 0], selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] 
= 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = __byte_perm (w[12], w[13], selector); + w[62] = __byte_perm (w[11], w[12], selector); + w[61] = __byte_perm (w[10], w[11], selector); + w[60] = __byte_perm (w[ 9], w[10], selector); + w[59] = __byte_perm (w[ 8], w[ 9], selector); + w[58] = __byte_perm (w[ 7], w[ 8], selector); + w[57] = __byte_perm (w[ 6], w[ 7], selector); + w[56] = __byte_perm (w[ 5], w[ 6], selector); + w[55] = __byte_perm (w[ 4], w[ 5], selector); + w[54] = __byte_perm (w[ 3], w[ 4], selector); + w[53] = __byte_perm (w[ 2], w[ 3], selector); + w[52] = __byte_perm (w[ 1], w[ 2], selector); + w[51] = __byte_perm (w[ 0], w[ 1], selector); + w[50] = __byte_perm ( 0, w[ 0], selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = __byte_perm (w[11], w[12], selector); + w[62] = __byte_perm (w[10], w[11], selector); + w[61] = __byte_perm (w[ 9], w[10], selector); + w[60] = __byte_perm (w[ 8], w[ 9], selector); + w[59] = __byte_perm (w[ 7], w[ 8], selector); + w[58] = 
__byte_perm (w[ 6], w[ 7], selector); + w[57] = __byte_perm (w[ 5], w[ 6], selector); + w[56] = __byte_perm (w[ 4], w[ 5], selector); + w[55] = __byte_perm (w[ 3], w[ 4], selector); + w[54] = __byte_perm (w[ 2], w[ 3], selector); + w[53] = __byte_perm (w[ 1], w[ 2], selector); + w[52] = __byte_perm (w[ 0], w[ 1], selector); + w[51] = __byte_perm ( 0, w[ 0], selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = __byte_perm (w[10], w[11], selector); + w[62] = __byte_perm (w[ 9], w[10], selector); + w[61] = __byte_perm (w[ 8], w[ 9], selector); + w[60] = __byte_perm (w[ 7], w[ 8], selector); + w[59] = __byte_perm (w[ 6], w[ 7], selector); + w[58] = __byte_perm (w[ 5], w[ 6], selector); + w[57] = __byte_perm (w[ 4], w[ 5], selector); + w[56] = __byte_perm (w[ 3], w[ 4], selector); + w[55] = __byte_perm (w[ 2], w[ 3], selector); + w[54] = __byte_perm (w[ 1], w[ 2], selector); + w[53] = __byte_perm (w[ 0], w[ 1], selector); + w[52] = __byte_perm ( 0, w[ 0], selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + 
w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = __byte_perm (w[ 9], w[10], selector); + w[62] = __byte_perm (w[ 8], w[ 9], selector); + w[61] = __byte_perm (w[ 7], w[ 8], selector); + w[60] = __byte_perm (w[ 6], w[ 7], selector); + w[59] = __byte_perm (w[ 5], w[ 6], selector); + w[58] = __byte_perm (w[ 4], w[ 5], selector); + w[57] = __byte_perm (w[ 3], w[ 4], selector); + w[56] = __byte_perm (w[ 2], w[ 3], selector); + w[55] = __byte_perm (w[ 1], w[ 2], selector); + w[54] = __byte_perm (w[ 0], w[ 1], selector); + w[53] = __byte_perm ( 0, w[ 0], selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = __byte_perm (w[ 8], w[ 9], selector); + w[62] = __byte_perm (w[ 7], w[ 8], selector); + w[61] = __byte_perm (w[ 6], w[ 7], selector); + w[60] = __byte_perm (w[ 5], w[ 6], selector); + w[59] = __byte_perm (w[ 4], w[ 5], selector); + w[58] = __byte_perm (w[ 3], w[ 4], selector); + w[57] = __byte_perm (w[ 2], w[ 3], selector); + w[56] = __byte_perm (w[ 1], w[ 2], selector); + w[55] = __byte_perm (w[ 0], w[ 1], 
selector); + w[54] = __byte_perm ( 0, w[ 0], selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = __byte_perm (w[ 7], w[ 8], selector); + w[62] = __byte_perm (w[ 6], w[ 7], selector); + w[61] = __byte_perm (w[ 5], w[ 6], selector); + w[60] = __byte_perm (w[ 4], w[ 5], selector); + w[59] = __byte_perm (w[ 3], w[ 4], selector); + w[58] = __byte_perm (w[ 2], w[ 3], selector); + w[57] = __byte_perm (w[ 1], w[ 2], selector); + w[56] = __byte_perm (w[ 0], w[ 1], selector); + w[55] = __byte_perm ( 0, w[ 0], selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = __byte_perm (w[ 6], w[ 7], selector); + 
w[62] = __byte_perm (w[ 5], w[ 6], selector); + w[61] = __byte_perm (w[ 4], w[ 5], selector); + w[60] = __byte_perm (w[ 3], w[ 4], selector); + w[59] = __byte_perm (w[ 2], w[ 3], selector); + w[58] = __byte_perm (w[ 1], w[ 2], selector); + w[57] = __byte_perm (w[ 0], w[ 1], selector); + w[56] = __byte_perm ( 0, w[ 0], selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = __byte_perm (w[ 5], w[ 6], selector); + w[62] = __byte_perm (w[ 4], w[ 5], selector); + w[61] = __byte_perm (w[ 3], w[ 4], selector); + w[60] = __byte_perm (w[ 2], w[ 3], selector); + w[59] = __byte_perm (w[ 1], w[ 2], selector); + w[58] = __byte_perm (w[ 0], w[ 1], selector); + w[57] = __byte_perm ( 0, w[ 0], selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 
0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = __byte_perm (w[ 4], w[ 5], selector); + w[62] = __byte_perm (w[ 3], w[ 4], selector); + w[61] = __byte_perm (w[ 2], w[ 3], selector); + w[60] = __byte_perm (w[ 1], w[ 2], selector); + w[59] = __byte_perm (w[ 0], w[ 1], selector); + w[58] = __byte_perm ( 0, w[ 0], selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = __byte_perm (w[ 3], w[ 4], selector); + w[62] = __byte_perm (w[ 2], w[ 3], selector); + w[61] = __byte_perm (w[ 1], w[ 2], selector); + w[60] = __byte_perm (w[ 0], w[ 1], selector); + w[59] = __byte_perm ( 0, w[ 0], selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 
0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = __byte_perm (w[ 2], w[ 3], selector); + w[62] = __byte_perm (w[ 1], w[ 2], selector); + w[61] = __byte_perm (w[ 0], w[ 1], selector); + w[60] = __byte_perm ( 0, w[ 0], selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = __byte_perm (w[ 1], w[ 2], selector); + w[62] = __byte_perm (w[ 0], w[ 1], selector); + w[61] = __byte_perm ( 0, w[ 0], selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + 
w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = __byte_perm (w[ 0], w[ 1], selector); + w[62] = __byte_perm ( 0, w[ 0], selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = __byte_perm ( 0, w[ 0], selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] 
= 0; + w[ 0] = 0; + + break; + } + #endif +} + +void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + switch (offset / 4) + { + case 0: + w[63] = amd_bytealign (w[62], w[63], offset); + w[62] = amd_bytealign (w[61], w[62], offset); + w[61] = amd_bytealign (w[60], w[61], offset); + w[60] = amd_bytealign (w[59], w[60], offset); + w[59] = amd_bytealign (w[58], w[59], offset); + w[58] = amd_bytealign (w[57], w[58], offset); + w[57] = amd_bytealign (w[56], w[57], offset); + w[56] = amd_bytealign (w[55], w[56], offset); + w[55] = amd_bytealign (w[54], w[55], offset); + w[54] = amd_bytealign (w[53], w[54], offset); + w[53] = amd_bytealign (w[52], w[53], offset); + w[52] = amd_bytealign (w[51], w[52], offset); + w[51] = amd_bytealign (w[50], w[51], offset); + w[50] = amd_bytealign (w[49], w[50], offset); + w[49] = amd_bytealign (w[48], w[49], offset); + w[48] = amd_bytealign (w[47], w[48], offset); + w[47] = amd_bytealign (w[46], w[47], offset); + w[46] = amd_bytealign (w[45], w[46], offset); + w[45] = amd_bytealign (w[44], w[45], offset); + w[44] = amd_bytealign (w[43], w[44], offset); + w[43] = amd_bytealign (w[42], w[43], offset); + w[42] = amd_bytealign (w[41], w[42], offset); + w[41] = amd_bytealign (w[40], w[41], offset); + w[40] = amd_bytealign (w[39], w[40], offset); + w[39] = amd_bytealign (w[38], w[39], offset); + w[38] = amd_bytealign (w[37], w[38], offset); + w[37] = amd_bytealign (w[36], w[37], offset); + w[36] = amd_bytealign (w[35], w[36], offset); + w[35] = amd_bytealign (w[34], w[35], offset); + w[34] = amd_bytealign (w[33], w[34], offset); + w[33] = amd_bytealign (w[32], w[33], offset); + w[32] = amd_bytealign (w[31], w[32], offset); + w[31] = amd_bytealign (w[30], w[31], offset); + w[30] = amd_bytealign (w[29], w[30], offset); + w[29] = amd_bytealign (w[28], w[29], offset); + w[28] = amd_bytealign (w[27], w[28], offset); + w[27] = amd_bytealign (w[26], w[27], offset); + w[26] = amd_bytealign 
(w[25], w[26], offset); + w[25] = amd_bytealign (w[24], w[25], offset); + w[24] = amd_bytealign (w[23], w[24], offset); + w[23] = amd_bytealign (w[22], w[23], offset); + w[22] = amd_bytealign (w[21], w[22], offset); + w[21] = amd_bytealign (w[20], w[21], offset); + w[20] = amd_bytealign (w[19], w[20], offset); + w[19] = amd_bytealign (w[18], w[19], offset); + w[18] = amd_bytealign (w[17], w[18], offset); + w[17] = amd_bytealign (w[16], w[17], offset); + w[16] = amd_bytealign (w[15], w[16], offset); + w[15] = amd_bytealign (w[14], w[15], offset); + w[14] = amd_bytealign (w[13], w[14], offset); + w[13] = amd_bytealign (w[12], w[13], offset); + w[12] = amd_bytealign (w[11], w[12], offset); + w[11] = amd_bytealign (w[10], w[11], offset); + w[10] = amd_bytealign (w[ 9], w[10], offset); + w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = amd_bytealign (w[61], w[62], offset); + w[62] = amd_bytealign (w[60], w[61], offset); + w[61] = amd_bytealign (w[59], w[60], offset); + w[60] = amd_bytealign (w[58], w[59], offset); + w[59] = amd_bytealign (w[57], w[58], offset); + w[58] = amd_bytealign (w[56], w[57], offset); + w[57] = amd_bytealign (w[55], w[56], offset); + w[56] = amd_bytealign (w[54], w[55], offset); + w[55] = amd_bytealign (w[53], w[54], offset); + w[54] = amd_bytealign (w[52], w[53], offset); + w[53] = amd_bytealign (w[51], w[52], offset); + w[52] = amd_bytealign (w[50], w[51], offset); + w[51] = amd_bytealign (w[49], w[50], offset); + w[50] = amd_bytealign (w[48], w[49], offset); + w[49] = amd_bytealign (w[47], 
w[48], offset); + w[48] = amd_bytealign (w[46], w[47], offset); + w[47] = amd_bytealign (w[45], w[46], offset); + w[46] = amd_bytealign (w[44], w[45], offset); + w[45] = amd_bytealign (w[43], w[44], offset); + w[44] = amd_bytealign (w[42], w[43], offset); + w[43] = amd_bytealign (w[41], w[42], offset); + w[42] = amd_bytealign (w[40], w[41], offset); + w[41] = amd_bytealign (w[39], w[40], offset); + w[40] = amd_bytealign (w[38], w[39], offset); + w[39] = amd_bytealign (w[37], w[38], offset); + w[38] = amd_bytealign (w[36], w[37], offset); + w[37] = amd_bytealign (w[35], w[36], offset); + w[36] = amd_bytealign (w[34], w[35], offset); + w[35] = amd_bytealign (w[33], w[34], offset); + w[34] = amd_bytealign (w[32], w[33], offset); + w[33] = amd_bytealign (w[31], w[32], offset); + w[32] = amd_bytealign (w[30], w[31], offset); + w[31] = amd_bytealign (w[29], w[30], offset); + w[30] = amd_bytealign (w[28], w[29], offset); + w[29] = amd_bytealign (w[27], w[28], offset); + w[28] = amd_bytealign (w[26], w[27], offset); + w[27] = amd_bytealign (w[25], w[26], offset); + w[26] = amd_bytealign (w[24], w[25], offset); + w[25] = amd_bytealign (w[23], w[24], offset); + w[24] = amd_bytealign (w[22], w[23], offset); + w[23] = amd_bytealign (w[21], w[22], offset); + w[22] = amd_bytealign (w[20], w[21], offset); + w[21] = amd_bytealign (w[19], w[20], offset); + w[20] = amd_bytealign (w[18], w[19], offset); + w[19] = amd_bytealign (w[17], w[18], offset); + w[18] = amd_bytealign (w[16], w[17], offset); + w[17] = amd_bytealign (w[15], w[16], offset); + w[16] = amd_bytealign (w[14], w[15], offset); + w[15] = amd_bytealign (w[13], w[14], offset); + w[14] = amd_bytealign (w[12], w[13], offset); + w[13] = amd_bytealign (w[11], w[12], offset); + w[12] = amd_bytealign (w[10], w[11], offset); + w[11] = amd_bytealign (w[ 9], w[10], offset); + w[10] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 7] = 
amd_bytealign (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[ 0] = 0; + + break; + + case 2: + w[63] = amd_bytealign (w[60], w[61], offset); + w[62] = amd_bytealign (w[59], w[60], offset); + w[61] = amd_bytealign (w[58], w[59], offset); + w[60] = amd_bytealign (w[57], w[58], offset); + w[59] = amd_bytealign (w[56], w[57], offset); + w[58] = amd_bytealign (w[55], w[56], offset); + w[57] = amd_bytealign (w[54], w[55], offset); + w[56] = amd_bytealign (w[53], w[54], offset); + w[55] = amd_bytealign (w[52], w[53], offset); + w[54] = amd_bytealign (w[51], w[52], offset); + w[53] = amd_bytealign (w[50], w[51], offset); + w[52] = amd_bytealign (w[49], w[50], offset); + w[51] = amd_bytealign (w[48], w[49], offset); + w[50] = amd_bytealign (w[47], w[48], offset); + w[49] = amd_bytealign (w[46], w[47], offset); + w[48] = amd_bytealign (w[45], w[46], offset); + w[47] = amd_bytealign (w[44], w[45], offset); + w[46] = amd_bytealign (w[43], w[44], offset); + w[45] = amd_bytealign (w[42], w[43], offset); + w[44] = amd_bytealign (w[41], w[42], offset); + w[43] = amd_bytealign (w[40], w[41], offset); + w[42] = amd_bytealign (w[39], w[40], offset); + w[41] = amd_bytealign (w[38], w[39], offset); + w[40] = amd_bytealign (w[37], w[38], offset); + w[39] = amd_bytealign (w[36], w[37], offset); + w[38] = amd_bytealign (w[35], w[36], offset); + w[37] = amd_bytealign (w[34], w[35], offset); + w[36] = amd_bytealign (w[33], w[34], offset); + w[35] = amd_bytealign (w[32], w[33], offset); + w[34] = amd_bytealign (w[31], w[32], offset); + w[33] = amd_bytealign (w[30], w[31], offset); + w[32] = amd_bytealign (w[29], w[30], offset); + w[31] = amd_bytealign (w[28], w[29], offset); + w[30] = amd_bytealign (w[27], w[28], offset); + w[29] 
= amd_bytealign (w[26], w[27], offset); + w[28] = amd_bytealign (w[25], w[26], offset); + w[27] = amd_bytealign (w[24], w[25], offset); + w[26] = amd_bytealign (w[23], w[24], offset); + w[25] = amd_bytealign (w[22], w[23], offset); + w[24] = amd_bytealign (w[21], w[22], offset); + w[23] = amd_bytealign (w[20], w[21], offset); + w[22] = amd_bytealign (w[19], w[20], offset); + w[21] = amd_bytealign (w[18], w[19], offset); + w[20] = amd_bytealign (w[17], w[18], offset); + w[19] = amd_bytealign (w[16], w[17], offset); + w[18] = amd_bytealign (w[15], w[16], offset); + w[17] = amd_bytealign (w[14], w[15], offset); + w[16] = amd_bytealign (w[13], w[14], offset); + w[15] = amd_bytealign (w[12], w[13], offset); + w[14] = amd_bytealign (w[11], w[12], offset); + w[13] = amd_bytealign (w[10], w[11], offset); + w[12] = amd_bytealign (w[ 9], w[10], offset); + w[11] = amd_bytealign (w[ 8], w[ 9], offset); + w[10] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = amd_bytealign (w[59], w[60], offset); + w[62] = amd_bytealign (w[58], w[59], offset); + w[61] = amd_bytealign (w[57], w[58], offset); + w[60] = amd_bytealign (w[56], w[57], offset); + w[59] = amd_bytealign (w[55], w[56], offset); + w[58] = amd_bytealign (w[54], w[55], offset); + w[57] = amd_bytealign (w[53], w[54], offset); + w[56] = amd_bytealign (w[52], w[53], offset); + w[55] = amd_bytealign (w[51], w[52], offset); + w[54] = amd_bytealign (w[50], w[51], offset); + w[53] = amd_bytealign (w[49], w[50], offset); + w[52] = amd_bytealign (w[48], w[49], offset); + w[51] = amd_bytealign (w[47], w[48], 
offset); + w[50] = amd_bytealign (w[46], w[47], offset); + w[49] = amd_bytealign (w[45], w[46], offset); + w[48] = amd_bytealign (w[44], w[45], offset); + w[47] = amd_bytealign (w[43], w[44], offset); + w[46] = amd_bytealign (w[42], w[43], offset); + w[45] = amd_bytealign (w[41], w[42], offset); + w[44] = amd_bytealign (w[40], w[41], offset); + w[43] = amd_bytealign (w[39], w[40], offset); + w[42] = amd_bytealign (w[38], w[39], offset); + w[41] = amd_bytealign (w[37], w[38], offset); + w[40] = amd_bytealign (w[36], w[37], offset); + w[39] = amd_bytealign (w[35], w[36], offset); + w[38] = amd_bytealign (w[34], w[35], offset); + w[37] = amd_bytealign (w[33], w[34], offset); + w[36] = amd_bytealign (w[32], w[33], offset); + w[35] = amd_bytealign (w[31], w[32], offset); + w[34] = amd_bytealign (w[30], w[31], offset); + w[33] = amd_bytealign (w[29], w[30], offset); + w[32] = amd_bytealign (w[28], w[29], offset); + w[31] = amd_bytealign (w[27], w[28], offset); + w[30] = amd_bytealign (w[26], w[27], offset); + w[29] = amd_bytealign (w[25], w[26], offset); + w[28] = amd_bytealign (w[24], w[25], offset); + w[27] = amd_bytealign (w[23], w[24], offset); + w[26] = amd_bytealign (w[22], w[23], offset); + w[25] = amd_bytealign (w[21], w[22], offset); + w[24] = amd_bytealign (w[20], w[21], offset); + w[23] = amd_bytealign (w[19], w[20], offset); + w[22] = amd_bytealign (w[18], w[19], offset); + w[21] = amd_bytealign (w[17], w[18], offset); + w[20] = amd_bytealign (w[16], w[17], offset); + w[19] = amd_bytealign (w[15], w[16], offset); + w[18] = amd_bytealign (w[14], w[15], offset); + w[17] = amd_bytealign (w[13], w[14], offset); + w[16] = amd_bytealign (w[12], w[13], offset); + w[15] = amd_bytealign (w[11], w[12], offset); + w[14] = amd_bytealign (w[10], w[11], offset); + w[13] = amd_bytealign (w[ 9], w[10], offset); + w[12] = amd_bytealign (w[ 8], w[ 9], offset); + w[11] = amd_bytealign (w[ 7], w[ 8], offset); + w[10] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 9] = 
amd_bytealign (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = amd_bytealign (w[58], w[59], offset); + w[62] = amd_bytealign (w[57], w[58], offset); + w[61] = amd_bytealign (w[56], w[57], offset); + w[60] = amd_bytealign (w[55], w[56], offset); + w[59] = amd_bytealign (w[54], w[55], offset); + w[58] = amd_bytealign (w[53], w[54], offset); + w[57] = amd_bytealign (w[52], w[53], offset); + w[56] = amd_bytealign (w[51], w[52], offset); + w[55] = amd_bytealign (w[50], w[51], offset); + w[54] = amd_bytealign (w[49], w[50], offset); + w[53] = amd_bytealign (w[48], w[49], offset); + w[52] = amd_bytealign (w[47], w[48], offset); + w[51] = amd_bytealign (w[46], w[47], offset); + w[50] = amd_bytealign (w[45], w[46], offset); + w[49] = amd_bytealign (w[44], w[45], offset); + w[48] = amd_bytealign (w[43], w[44], offset); + w[47] = amd_bytealign (w[42], w[43], offset); + w[46] = amd_bytealign (w[41], w[42], offset); + w[45] = amd_bytealign (w[40], w[41], offset); + w[44] = amd_bytealign (w[39], w[40], offset); + w[43] = amd_bytealign (w[38], w[39], offset); + w[42] = amd_bytealign (w[37], w[38], offset); + w[41] = amd_bytealign (w[36], w[37], offset); + w[40] = amd_bytealign (w[35], w[36], offset); + w[39] = amd_bytealign (w[34], w[35], offset); + w[38] = amd_bytealign (w[33], w[34], offset); + w[37] = amd_bytealign (w[32], w[33], offset); + w[36] = amd_bytealign (w[31], w[32], offset); + w[35] = amd_bytealign (w[30], w[31], offset); + w[34] = amd_bytealign (w[29], w[30], offset); + w[33] = amd_bytealign (w[28], w[29], offset); + w[32] = amd_bytealign (w[27], w[28], offset); + w[31] = amd_bytealign (w[26], w[27], offset); + w[30] = amd_bytealign 
(w[25], w[26], offset); + w[29] = amd_bytealign (w[24], w[25], offset); + w[28] = amd_bytealign (w[23], w[24], offset); + w[27] = amd_bytealign (w[22], w[23], offset); + w[26] = amd_bytealign (w[21], w[22], offset); + w[25] = amd_bytealign (w[20], w[21], offset); + w[24] = amd_bytealign (w[19], w[20], offset); + w[23] = amd_bytealign (w[18], w[19], offset); + w[22] = amd_bytealign (w[17], w[18], offset); + w[21] = amd_bytealign (w[16], w[17], offset); + w[20] = amd_bytealign (w[15], w[16], offset); + w[19] = amd_bytealign (w[14], w[15], offset); + w[18] = amd_bytealign (w[13], w[14], offset); + w[17] = amd_bytealign (w[12], w[13], offset); + w[16] = amd_bytealign (w[11], w[12], offset); + w[15] = amd_bytealign (w[10], w[11], offset); + w[14] = amd_bytealign (w[ 9], w[10], offset); + w[13] = amd_bytealign (w[ 8], w[ 9], offset); + w[12] = amd_bytealign (w[ 7], w[ 8], offset); + w[11] = amd_bytealign (w[ 6], w[ 7], offset); + w[10] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = amd_bytealign (w[57], w[58], offset); + w[62] = amd_bytealign (w[56], w[57], offset); + w[61] = amd_bytealign (w[55], w[56], offset); + w[60] = amd_bytealign (w[54], w[55], offset); + w[59] = amd_bytealign (w[53], w[54], offset); + w[58] = amd_bytealign (w[52], w[53], offset); + w[57] = amd_bytealign (w[51], w[52], offset); + w[56] = amd_bytealign (w[50], w[51], offset); + w[55] = amd_bytealign (w[49], w[50], offset); + w[54] = amd_bytealign (w[48], w[49], offset); + w[53] = amd_bytealign (w[47], w[48], offset); + w[52] = amd_bytealign (w[46], w[47], offset); + w[51] = amd_bytealign (w[45], w[46], offset); + w[50] = amd_bytealign (w[44], 
w[45], offset); + w[49] = amd_bytealign (w[43], w[44], offset); + w[48] = amd_bytealign (w[42], w[43], offset); + w[47] = amd_bytealign (w[41], w[42], offset); + w[46] = amd_bytealign (w[40], w[41], offset); + w[45] = amd_bytealign (w[39], w[40], offset); + w[44] = amd_bytealign (w[38], w[39], offset); + w[43] = amd_bytealign (w[37], w[38], offset); + w[42] = amd_bytealign (w[36], w[37], offset); + w[41] = amd_bytealign (w[35], w[36], offset); + w[40] = amd_bytealign (w[34], w[35], offset); + w[39] = amd_bytealign (w[33], w[34], offset); + w[38] = amd_bytealign (w[32], w[33], offset); + w[37] = amd_bytealign (w[31], w[32], offset); + w[36] = amd_bytealign (w[30], w[31], offset); + w[35] = amd_bytealign (w[29], w[30], offset); + w[34] = amd_bytealign (w[28], w[29], offset); + w[33] = amd_bytealign (w[27], w[28], offset); + w[32] = amd_bytealign (w[26], w[27], offset); + w[31] = amd_bytealign (w[25], w[26], offset); + w[30] = amd_bytealign (w[24], w[25], offset); + w[29] = amd_bytealign (w[23], w[24], offset); + w[28] = amd_bytealign (w[22], w[23], offset); + w[27] = amd_bytealign (w[21], w[22], offset); + w[26] = amd_bytealign (w[20], w[21], offset); + w[25] = amd_bytealign (w[19], w[20], offset); + w[24] = amd_bytealign (w[18], w[19], offset); + w[23] = amd_bytealign (w[17], w[18], offset); + w[22] = amd_bytealign (w[16], w[17], offset); + w[21] = amd_bytealign (w[15], w[16], offset); + w[20] = amd_bytealign (w[14], w[15], offset); + w[19] = amd_bytealign (w[13], w[14], offset); + w[18] = amd_bytealign (w[12], w[13], offset); + w[17] = amd_bytealign (w[11], w[12], offset); + w[16] = amd_bytealign (w[10], w[11], offset); + w[15] = amd_bytealign (w[ 9], w[10], offset); + w[14] = amd_bytealign (w[ 8], w[ 9], offset); + w[13] = amd_bytealign (w[ 7], w[ 8], offset); + w[12] = amd_bytealign (w[ 6], w[ 7], offset); + w[11] = amd_bytealign (w[ 5], w[ 6], offset); + w[10] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 8] = 
amd_bytealign (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = amd_bytealign (w[56], w[57], offset); + w[62] = amd_bytealign (w[55], w[56], offset); + w[61] = amd_bytealign (w[54], w[55], offset); + w[60] = amd_bytealign (w[53], w[54], offset); + w[59] = amd_bytealign (w[52], w[53], offset); + w[58] = amd_bytealign (w[51], w[52], offset); + w[57] = amd_bytealign (w[50], w[51], offset); + w[56] = amd_bytealign (w[49], w[50], offset); + w[55] = amd_bytealign (w[48], w[49], offset); + w[54] = amd_bytealign (w[47], w[48], offset); + w[53] = amd_bytealign (w[46], w[47], offset); + w[52] = amd_bytealign (w[45], w[46], offset); + w[51] = amd_bytealign (w[44], w[45], offset); + w[50] = amd_bytealign (w[43], w[44], offset); + w[49] = amd_bytealign (w[42], w[43], offset); + w[48] = amd_bytealign (w[41], w[42], offset); + w[47] = amd_bytealign (w[40], w[41], offset); + w[46] = amd_bytealign (w[39], w[40], offset); + w[45] = amd_bytealign (w[38], w[39], offset); + w[44] = amd_bytealign (w[37], w[38], offset); + w[43] = amd_bytealign (w[36], w[37], offset); + w[42] = amd_bytealign (w[35], w[36], offset); + w[41] = amd_bytealign (w[34], w[35], offset); + w[40] = amd_bytealign (w[33], w[34], offset); + w[39] = amd_bytealign (w[32], w[33], offset); + w[38] = amd_bytealign (w[31], w[32], offset); + w[37] = amd_bytealign (w[30], w[31], offset); + w[36] = amd_bytealign (w[29], w[30], offset); + w[35] = amd_bytealign (w[28], w[29], offset); + w[34] = amd_bytealign (w[27], w[28], offset); + w[33] = amd_bytealign (w[26], w[27], offset); + w[32] = amd_bytealign (w[25], w[26], offset); + w[31] = amd_bytealign (w[24], w[25], offset); + w[30] = amd_bytealign (w[23], w[24], offset); + w[29] = amd_bytealign (w[22], w[23], offset); + w[28] = amd_bytealign (w[21], w[22], offset); + 
w[27] = amd_bytealign (w[20], w[21], offset); + w[26] = amd_bytealign (w[19], w[20], offset); + w[25] = amd_bytealign (w[18], w[19], offset); + w[24] = amd_bytealign (w[17], w[18], offset); + w[23] = amd_bytealign (w[16], w[17], offset); + w[22] = amd_bytealign (w[15], w[16], offset); + w[21] = amd_bytealign (w[14], w[15], offset); + w[20] = amd_bytealign (w[13], w[14], offset); + w[19] = amd_bytealign (w[12], w[13], offset); + w[18] = amd_bytealign (w[11], w[12], offset); + w[17] = amd_bytealign (w[10], w[11], offset); + w[16] = amd_bytealign (w[ 9], w[10], offset); + w[15] = amd_bytealign (w[ 8], w[ 9], offset); + w[14] = amd_bytealign (w[ 7], w[ 8], offset); + w[13] = amd_bytealign (w[ 6], w[ 7], offset); + w[12] = amd_bytealign (w[ 5], w[ 6], offset); + w[11] = amd_bytealign (w[ 4], w[ 5], offset); + w[10] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = amd_bytealign (w[55], w[56], offset); + w[62] = amd_bytealign (w[54], w[55], offset); + w[61] = amd_bytealign (w[53], w[54], offset); + w[60] = amd_bytealign (w[52], w[53], offset); + w[59] = amd_bytealign (w[51], w[52], offset); + w[58] = amd_bytealign (w[50], w[51], offset); + w[57] = amd_bytealign (w[49], w[50], offset); + w[56] = amd_bytealign (w[48], w[49], offset); + w[55] = amd_bytealign (w[47], w[48], offset); + w[54] = amd_bytealign (w[46], w[47], offset); + w[53] = amd_bytealign (w[45], w[46], offset); + w[52] = amd_bytealign (w[44], w[45], offset); + w[51] = amd_bytealign (w[43], w[44], offset); + w[50] = amd_bytealign (w[42], w[43], offset); + w[49] = amd_bytealign (w[41], w[42], offset); + w[48] = amd_bytealign (w[40], w[41], offset); + w[47] = amd_bytealign (w[39], w[40], offset); + w[46] = amd_bytealign (w[38], 
w[39], offset); + w[45] = amd_bytealign (w[37], w[38], offset); + w[44] = amd_bytealign (w[36], w[37], offset); + w[43] = amd_bytealign (w[35], w[36], offset); + w[42] = amd_bytealign (w[34], w[35], offset); + w[41] = amd_bytealign (w[33], w[34], offset); + w[40] = amd_bytealign (w[32], w[33], offset); + w[39] = amd_bytealign (w[31], w[32], offset); + w[38] = amd_bytealign (w[30], w[31], offset); + w[37] = amd_bytealign (w[29], w[30], offset); + w[36] = amd_bytealign (w[28], w[29], offset); + w[35] = amd_bytealign (w[27], w[28], offset); + w[34] = amd_bytealign (w[26], w[27], offset); + w[33] = amd_bytealign (w[25], w[26], offset); + w[32] = amd_bytealign (w[24], w[25], offset); + w[31] = amd_bytealign (w[23], w[24], offset); + w[30] = amd_bytealign (w[22], w[23], offset); + w[29] = amd_bytealign (w[21], w[22], offset); + w[28] = amd_bytealign (w[20], w[21], offset); + w[27] = amd_bytealign (w[19], w[20], offset); + w[26] = amd_bytealign (w[18], w[19], offset); + w[25] = amd_bytealign (w[17], w[18], offset); + w[24] = amd_bytealign (w[16], w[17], offset); + w[23] = amd_bytealign (w[15], w[16], offset); + w[22] = amd_bytealign (w[14], w[15], offset); + w[21] = amd_bytealign (w[13], w[14], offset); + w[20] = amd_bytealign (w[12], w[13], offset); + w[19] = amd_bytealign (w[11], w[12], offset); + w[18] = amd_bytealign (w[10], w[11], offset); + w[17] = amd_bytealign (w[ 9], w[10], offset); + w[16] = amd_bytealign (w[ 8], w[ 9], offset); + w[15] = amd_bytealign (w[ 7], w[ 8], offset); + w[14] = amd_bytealign (w[ 6], w[ 7], offset); + w[13] = amd_bytealign (w[ 5], w[ 6], offset); + w[12] = amd_bytealign (w[ 4], w[ 5], offset); + w[11] = amd_bytealign (w[ 3], w[ 4], offset); + w[10] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: 
+ w[63] = amd_bytealign (w[54], w[55], offset); + w[62] = amd_bytealign (w[53], w[54], offset); + w[61] = amd_bytealign (w[52], w[53], offset); + w[60] = amd_bytealign (w[51], w[52], offset); + w[59] = amd_bytealign (w[50], w[51], offset); + w[58] = amd_bytealign (w[49], w[50], offset); + w[57] = amd_bytealign (w[48], w[49], offset); + w[56] = amd_bytealign (w[47], w[48], offset); + w[55] = amd_bytealign (w[46], w[47], offset); + w[54] = amd_bytealign (w[45], w[46], offset); + w[53] = amd_bytealign (w[44], w[45], offset); + w[52] = amd_bytealign (w[43], w[44], offset); + w[51] = amd_bytealign (w[42], w[43], offset); + w[50] = amd_bytealign (w[41], w[42], offset); + w[49] = amd_bytealign (w[40], w[41], offset); + w[48] = amd_bytealign (w[39], w[40], offset); + w[47] = amd_bytealign (w[38], w[39], offset); + w[46] = amd_bytealign (w[37], w[38], offset); + w[45] = amd_bytealign (w[36], w[37], offset); + w[44] = amd_bytealign (w[35], w[36], offset); + w[43] = amd_bytealign (w[34], w[35], offset); + w[42] = amd_bytealign (w[33], w[34], offset); + w[41] = amd_bytealign (w[32], w[33], offset); + w[40] = amd_bytealign (w[31], w[32], offset); + w[39] = amd_bytealign (w[30], w[31], offset); + w[38] = amd_bytealign (w[29], w[30], offset); + w[37] = amd_bytealign (w[28], w[29], offset); + w[36] = amd_bytealign (w[27], w[28], offset); + w[35] = amd_bytealign (w[26], w[27], offset); + w[34] = amd_bytealign (w[25], w[26], offset); + w[33] = amd_bytealign (w[24], w[25], offset); + w[32] = amd_bytealign (w[23], w[24], offset); + w[31] = amd_bytealign (w[22], w[23], offset); + w[30] = amd_bytealign (w[21], w[22], offset); + w[29] = amd_bytealign (w[20], w[21], offset); + w[28] = amd_bytealign (w[19], w[20], offset); + w[27] = amd_bytealign (w[18], w[19], offset); + w[26] = amd_bytealign (w[17], w[18], offset); + w[25] = amd_bytealign (w[16], w[17], offset); + w[24] = amd_bytealign (w[15], w[16], offset); + w[23] = amd_bytealign (w[14], w[15], offset); + w[22] = amd_bytealign (w[13], 
w[14], offset); + w[21] = amd_bytealign (w[12], w[13], offset); + w[20] = amd_bytealign (w[11], w[12], offset); + w[19] = amd_bytealign (w[10], w[11], offset); + w[18] = amd_bytealign (w[ 9], w[10], offset); + w[17] = amd_bytealign (w[ 8], w[ 9], offset); + w[16] = amd_bytealign (w[ 7], w[ 8], offset); + w[15] = amd_bytealign (w[ 6], w[ 7], offset); + w[14] = amd_bytealign (w[ 5], w[ 6], offset); + w[13] = amd_bytealign (w[ 4], w[ 5], offset); + w[12] = amd_bytealign (w[ 3], w[ 4], offset); + w[11] = amd_bytealign (w[ 2], w[ 3], offset); + w[10] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = amd_bytealign (w[53], w[54], offset); + w[62] = amd_bytealign (w[52], w[53], offset); + w[61] = amd_bytealign (w[51], w[52], offset); + w[60] = amd_bytealign (w[50], w[51], offset); + w[59] = amd_bytealign (w[49], w[50], offset); + w[58] = amd_bytealign (w[48], w[49], offset); + w[57] = amd_bytealign (w[47], w[48], offset); + w[56] = amd_bytealign (w[46], w[47], offset); + w[55] = amd_bytealign (w[45], w[46], offset); + w[54] = amd_bytealign (w[44], w[45], offset); + w[53] = amd_bytealign (w[43], w[44], offset); + w[52] = amd_bytealign (w[42], w[43], offset); + w[51] = amd_bytealign (w[41], w[42], offset); + w[50] = amd_bytealign (w[40], w[41], offset); + w[49] = amd_bytealign (w[39], w[40], offset); + w[48] = amd_bytealign (w[38], w[39], offset); + w[47] = amd_bytealign (w[37], w[38], offset); + w[46] = amd_bytealign (w[36], w[37], offset); + w[45] = amd_bytealign (w[35], w[36], offset); + w[44] = amd_bytealign (w[34], w[35], offset); + w[43] = amd_bytealign (w[33], w[34], offset); + w[42] = amd_bytealign (w[32], w[33], offset); + w[41] = amd_bytealign (w[31], w[32], offset); + w[40] = amd_bytealign (w[30], w[31], offset); + w[39] = amd_bytealign (w[29], 
w[30], offset); + w[38] = amd_bytealign (w[28], w[29], offset); + w[37] = amd_bytealign (w[27], w[28], offset); + w[36] = amd_bytealign (w[26], w[27], offset); + w[35] = amd_bytealign (w[25], w[26], offset); + w[34] = amd_bytealign (w[24], w[25], offset); + w[33] = amd_bytealign (w[23], w[24], offset); + w[32] = amd_bytealign (w[22], w[23], offset); + w[31] = amd_bytealign (w[21], w[22], offset); + w[30] = amd_bytealign (w[20], w[21], offset); + w[29] = amd_bytealign (w[19], w[20], offset); + w[28] = amd_bytealign (w[18], w[19], offset); + w[27] = amd_bytealign (w[17], w[18], offset); + w[26] = amd_bytealign (w[16], w[17], offset); + w[25] = amd_bytealign (w[15], w[16], offset); + w[24] = amd_bytealign (w[14], w[15], offset); + w[23] = amd_bytealign (w[13], w[14], offset); + w[22] = amd_bytealign (w[12], w[13], offset); + w[21] = amd_bytealign (w[11], w[12], offset); + w[20] = amd_bytealign (w[10], w[11], offset); + w[19] = amd_bytealign (w[ 9], w[10], offset); + w[18] = amd_bytealign (w[ 8], w[ 9], offset); + w[17] = amd_bytealign (w[ 7], w[ 8], offset); + w[16] = amd_bytealign (w[ 6], w[ 7], offset); + w[15] = amd_bytealign (w[ 5], w[ 6], offset); + w[14] = amd_bytealign (w[ 4], w[ 5], offset); + w[13] = amd_bytealign (w[ 3], w[ 4], offset); + w[12] = amd_bytealign (w[ 2], w[ 3], offset); + w[11] = amd_bytealign (w[ 1], w[ 2], offset); + w[10] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = amd_bytealign (w[52], w[53], offset); + w[62] = amd_bytealign (w[51], w[52], offset); + w[61] = amd_bytealign (w[50], w[51], offset); + w[60] = amd_bytealign (w[49], w[50], offset); + w[59] = amd_bytealign (w[48], w[49], offset); + w[58] = amd_bytealign (w[47], w[48], offset); + w[57] = amd_bytealign (w[46], w[47], offset); + w[56] = amd_bytealign (w[45], w[46], offset); + w[55] = 
amd_bytealign (w[44], w[45], offset); + w[54] = amd_bytealign (w[43], w[44], offset); + w[53] = amd_bytealign (w[42], w[43], offset); + w[52] = amd_bytealign (w[41], w[42], offset); + w[51] = amd_bytealign (w[40], w[41], offset); + w[50] = amd_bytealign (w[39], w[40], offset); + w[49] = amd_bytealign (w[38], w[39], offset); + w[48] = amd_bytealign (w[37], w[38], offset); + w[47] = amd_bytealign (w[36], w[37], offset); + w[46] = amd_bytealign (w[35], w[36], offset); + w[45] = amd_bytealign (w[34], w[35], offset); + w[44] = amd_bytealign (w[33], w[34], offset); + w[43] = amd_bytealign (w[32], w[33], offset); + w[42] = amd_bytealign (w[31], w[32], offset); + w[41] = amd_bytealign (w[30], w[31], offset); + w[40] = amd_bytealign (w[29], w[30], offset); + w[39] = amd_bytealign (w[28], w[29], offset); + w[38] = amd_bytealign (w[27], w[28], offset); + w[37] = amd_bytealign (w[26], w[27], offset); + w[36] = amd_bytealign (w[25], w[26], offset); + w[35] = amd_bytealign (w[24], w[25], offset); + w[34] = amd_bytealign (w[23], w[24], offset); + w[33] = amd_bytealign (w[22], w[23], offset); + w[32] = amd_bytealign (w[21], w[22], offset); + w[31] = amd_bytealign (w[20], w[21], offset); + w[30] = amd_bytealign (w[19], w[20], offset); + w[29] = amd_bytealign (w[18], w[19], offset); + w[28] = amd_bytealign (w[17], w[18], offset); + w[27] = amd_bytealign (w[16], w[17], offset); + w[26] = amd_bytealign (w[15], w[16], offset); + w[25] = amd_bytealign (w[14], w[15], offset); + w[24] = amd_bytealign (w[13], w[14], offset); + w[23] = amd_bytealign (w[12], w[13], offset); + w[22] = amd_bytealign (w[11], w[12], offset); + w[21] = amd_bytealign (w[10], w[11], offset); + w[20] = amd_bytealign (w[ 9], w[10], offset); + w[19] = amd_bytealign (w[ 8], w[ 9], offset); + w[18] = amd_bytealign (w[ 7], w[ 8], offset); + w[17] = amd_bytealign (w[ 6], w[ 7], offset); + w[16] = amd_bytealign (w[ 5], w[ 6], offset); + w[15] = amd_bytealign (w[ 4], w[ 5], offset); + w[14] = amd_bytealign (w[ 3], w[ 4], 
offset); + w[13] = amd_bytealign (w[ 2], w[ 3], offset); + w[12] = amd_bytealign (w[ 1], w[ 2], offset); + w[11] = amd_bytealign (w[ 0], w[ 1], offset); + w[10] = amd_bytealign ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = amd_bytealign (w[51], w[52], offset); + w[62] = amd_bytealign (w[50], w[51], offset); + w[61] = amd_bytealign (w[49], w[50], offset); + w[60] = amd_bytealign (w[48], w[49], offset); + w[59] = amd_bytealign (w[47], w[48], offset); + w[58] = amd_bytealign (w[46], w[47], offset); + w[57] = amd_bytealign (w[45], w[46], offset); + w[56] = amd_bytealign (w[44], w[45], offset); + w[55] = amd_bytealign (w[43], w[44], offset); + w[54] = amd_bytealign (w[42], w[43], offset); + w[53] = amd_bytealign (w[41], w[42], offset); + w[52] = amd_bytealign (w[40], w[41], offset); + w[51] = amd_bytealign (w[39], w[40], offset); + w[50] = amd_bytealign (w[38], w[39], offset); + w[49] = amd_bytealign (w[37], w[38], offset); + w[48] = amd_bytealign (w[36], w[37], offset); + w[47] = amd_bytealign (w[35], w[36], offset); + w[46] = amd_bytealign (w[34], w[35], offset); + w[45] = amd_bytealign (w[33], w[34], offset); + w[44] = amd_bytealign (w[32], w[33], offset); + w[43] = amd_bytealign (w[31], w[32], offset); + w[42] = amd_bytealign (w[30], w[31], offset); + w[41] = amd_bytealign (w[29], w[30], offset); + w[40] = amd_bytealign (w[28], w[29], offset); + w[39] = amd_bytealign (w[27], w[28], offset); + w[38] = amd_bytealign (w[26], w[27], offset); + w[37] = amd_bytealign (w[25], w[26], offset); + w[36] = amd_bytealign (w[24], w[25], offset); + w[35] = amd_bytealign (w[23], w[24], offset); + w[34] = amd_bytealign (w[22], w[23], offset); + w[33] = amd_bytealign (w[21], w[22], offset); + w[32] = amd_bytealign (w[20], w[21], offset); + w[31] = amd_bytealign (w[19], w[20], offset); + w[30] = amd_bytealign (w[18], w[19], offset); + w[29] = 
amd_bytealign (w[17], w[18], offset); + w[28] = amd_bytealign (w[16], w[17], offset); + w[27] = amd_bytealign (w[15], w[16], offset); + w[26] = amd_bytealign (w[14], w[15], offset); + w[25] = amd_bytealign (w[13], w[14], offset); + w[24] = amd_bytealign (w[12], w[13], offset); + w[23] = amd_bytealign (w[11], w[12], offset); + w[22] = amd_bytealign (w[10], w[11], offset); + w[21] = amd_bytealign (w[ 9], w[10], offset); + w[20] = amd_bytealign (w[ 8], w[ 9], offset); + w[19] = amd_bytealign (w[ 7], w[ 8], offset); + w[18] = amd_bytealign (w[ 6], w[ 7], offset); + w[17] = amd_bytealign (w[ 5], w[ 6], offset); + w[16] = amd_bytealign (w[ 4], w[ 5], offset); + w[15] = amd_bytealign (w[ 3], w[ 4], offset); + w[14] = amd_bytealign (w[ 2], w[ 3], offset); + w[13] = amd_bytealign (w[ 1], w[ 2], offset); + w[12] = amd_bytealign (w[ 0], w[ 1], offset); + w[11] = amd_bytealign ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = amd_bytealign (w[50], w[51], offset); + w[62] = amd_bytealign (w[49], w[50], offset); + w[61] = amd_bytealign (w[48], w[49], offset); + w[60] = amd_bytealign (w[47], w[48], offset); + w[59] = amd_bytealign (w[46], w[47], offset); + w[58] = amd_bytealign (w[45], w[46], offset); + w[57] = amd_bytealign (w[44], w[45], offset); + w[56] = amd_bytealign (w[43], w[44], offset); + w[55] = amd_bytealign (w[42], w[43], offset); + w[54] = amd_bytealign (w[41], w[42], offset); + w[53] = amd_bytealign (w[40], w[41], offset); + w[52] = amd_bytealign (w[39], w[40], offset); + w[51] = amd_bytealign (w[38], w[39], offset); + w[50] = amd_bytealign (w[37], w[38], offset); + w[49] = amd_bytealign (w[36], w[37], offset); + w[48] = amd_bytealign (w[35], w[36], offset); + w[47] = amd_bytealign (w[34], w[35], offset); + w[46] = amd_bytealign (w[33], w[34], offset); + w[45] = amd_bytealign (w[32], w[33], offset); + w[44] = 
amd_bytealign (w[31], w[32], offset); + w[43] = amd_bytealign (w[30], w[31], offset); + w[42] = amd_bytealign (w[29], w[30], offset); + w[41] = amd_bytealign (w[28], w[29], offset); + w[40] = amd_bytealign (w[27], w[28], offset); + w[39] = amd_bytealign (w[26], w[27], offset); + w[38] = amd_bytealign (w[25], w[26], offset); + w[37] = amd_bytealign (w[24], w[25], offset); + w[36] = amd_bytealign (w[23], w[24], offset); + w[35] = amd_bytealign (w[22], w[23], offset); + w[34] = amd_bytealign (w[21], w[22], offset); + w[33] = amd_bytealign (w[20], w[21], offset); + w[32] = amd_bytealign (w[19], w[20], offset); + w[31] = amd_bytealign (w[18], w[19], offset); + w[30] = amd_bytealign (w[17], w[18], offset); + w[29] = amd_bytealign (w[16], w[17], offset); + w[28] = amd_bytealign (w[15], w[16], offset); + w[27] = amd_bytealign (w[14], w[15], offset); + w[26] = amd_bytealign (w[13], w[14], offset); + w[25] = amd_bytealign (w[12], w[13], offset); + w[24] = amd_bytealign (w[11], w[12], offset); + w[23] = amd_bytealign (w[10], w[11], offset); + w[22] = amd_bytealign (w[ 9], w[10], offset); + w[21] = amd_bytealign (w[ 8], w[ 9], offset); + w[20] = amd_bytealign (w[ 7], w[ 8], offset); + w[19] = amd_bytealign (w[ 6], w[ 7], offset); + w[18] = amd_bytealign (w[ 5], w[ 6], offset); + w[17] = amd_bytealign (w[ 4], w[ 5], offset); + w[16] = amd_bytealign (w[ 3], w[ 4], offset); + w[15] = amd_bytealign (w[ 2], w[ 3], offset); + w[14] = amd_bytealign (w[ 1], w[ 2], offset); + w[13] = amd_bytealign (w[ 0], w[ 1], offset); + w[12] = amd_bytealign ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = amd_bytealign (w[49], w[50], offset); + w[62] = amd_bytealign (w[48], w[49], offset); + w[61] = amd_bytealign (w[47], w[48], offset); + w[60] = amd_bytealign (w[46], w[47], offset); + w[59] = amd_bytealign (w[45], w[46], offset); + w[58] 
= amd_bytealign (w[44], w[45], offset); + w[57] = amd_bytealign (w[43], w[44], offset); + w[56] = amd_bytealign (w[42], w[43], offset); + w[55] = amd_bytealign (w[41], w[42], offset); + w[54] = amd_bytealign (w[40], w[41], offset); + w[53] = amd_bytealign (w[39], w[40], offset); + w[52] = amd_bytealign (w[38], w[39], offset); + w[51] = amd_bytealign (w[37], w[38], offset); + w[50] = amd_bytealign (w[36], w[37], offset); + w[49] = amd_bytealign (w[35], w[36], offset); + w[48] = amd_bytealign (w[34], w[35], offset); + w[47] = amd_bytealign (w[33], w[34], offset); + w[46] = amd_bytealign (w[32], w[33], offset); + w[45] = amd_bytealign (w[31], w[32], offset); + w[44] = amd_bytealign (w[30], w[31], offset); + w[43] = amd_bytealign (w[29], w[30], offset); + w[42] = amd_bytealign (w[28], w[29], offset); + w[41] = amd_bytealign (w[27], w[28], offset); + w[40] = amd_bytealign (w[26], w[27], offset); + w[39] = amd_bytealign (w[25], w[26], offset); + w[38] = amd_bytealign (w[24], w[25], offset); + w[37] = amd_bytealign (w[23], w[24], offset); + w[36] = amd_bytealign (w[22], w[23], offset); + w[35] = amd_bytealign (w[21], w[22], offset); + w[34] = amd_bytealign (w[20], w[21], offset); + w[33] = amd_bytealign (w[19], w[20], offset); + w[32] = amd_bytealign (w[18], w[19], offset); + w[31] = amd_bytealign (w[17], w[18], offset); + w[30] = amd_bytealign (w[16], w[17], offset); + w[29] = amd_bytealign (w[15], w[16], offset); + w[28] = amd_bytealign (w[14], w[15], offset); + w[27] = amd_bytealign (w[13], w[14], offset); + w[26] = amd_bytealign (w[12], w[13], offset); + w[25] = amd_bytealign (w[11], w[12], offset); + w[24] = amd_bytealign (w[10], w[11], offset); + w[23] = amd_bytealign (w[ 9], w[10], offset); + w[22] = amd_bytealign (w[ 8], w[ 9], offset); + w[21] = amd_bytealign (w[ 7], w[ 8], offset); + w[20] = amd_bytealign (w[ 6], w[ 7], offset); + w[19] = amd_bytealign (w[ 5], w[ 6], offset); + w[18] = amd_bytealign (w[ 4], w[ 5], offset); + w[17] = amd_bytealign (w[ 3], w[ 4], 
offset); + w[16] = amd_bytealign (w[ 2], w[ 3], offset); + w[15] = amd_bytealign (w[ 1], w[ 2], offset); + w[14] = amd_bytealign (w[ 0], w[ 1], offset); + w[13] = amd_bytealign ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = amd_bytealign (w[48], w[49], offset); + w[62] = amd_bytealign (w[47], w[48], offset); + w[61] = amd_bytealign (w[46], w[47], offset); + w[60] = amd_bytealign (w[45], w[46], offset); + w[59] = amd_bytealign (w[44], w[45], offset); + w[58] = amd_bytealign (w[43], w[44], offset); + w[57] = amd_bytealign (w[42], w[43], offset); + w[56] = amd_bytealign (w[41], w[42], offset); + w[55] = amd_bytealign (w[40], w[41], offset); + w[54] = amd_bytealign (w[39], w[40], offset); + w[53] = amd_bytealign (w[38], w[39], offset); + w[52] = amd_bytealign (w[37], w[38], offset); + w[51] = amd_bytealign (w[36], w[37], offset); + w[50] = amd_bytealign (w[35], w[36], offset); + w[49] = amd_bytealign (w[34], w[35], offset); + w[48] = amd_bytealign (w[33], w[34], offset); + w[47] = amd_bytealign (w[32], w[33], offset); + w[46] = amd_bytealign (w[31], w[32], offset); + w[45] = amd_bytealign (w[30], w[31], offset); + w[44] = amd_bytealign (w[29], w[30], offset); + w[43] = amd_bytealign (w[28], w[29], offset); + w[42] = amd_bytealign (w[27], w[28], offset); + w[41] = amd_bytealign (w[26], w[27], offset); + w[40] = amd_bytealign (w[25], w[26], offset); + w[39] = amd_bytealign (w[24], w[25], offset); + w[38] = amd_bytealign (w[23], w[24], offset); + w[37] = amd_bytealign (w[22], w[23], offset); + w[36] = amd_bytealign (w[21], w[22], offset); + w[35] = amd_bytealign (w[20], w[21], offset); + w[34] = amd_bytealign (w[19], w[20], offset); + w[33] = amd_bytealign (w[18], w[19], offset); + w[32] = amd_bytealign (w[17], w[18], offset); + w[31] = amd_bytealign (w[16], w[17], offset); + w[30] = amd_bytealign 
(w[15], w[16], offset); + w[29] = amd_bytealign (w[14], w[15], offset); + w[28] = amd_bytealign (w[13], w[14], offset); + w[27] = amd_bytealign (w[12], w[13], offset); + w[26] = amd_bytealign (w[11], w[12], offset); + w[25] = amd_bytealign (w[10], w[11], offset); + w[24] = amd_bytealign (w[ 9], w[10], offset); + w[23] = amd_bytealign (w[ 8], w[ 9], offset); + w[22] = amd_bytealign (w[ 7], w[ 8], offset); + w[21] = amd_bytealign (w[ 6], w[ 7], offset); + w[20] = amd_bytealign (w[ 5], w[ 6], offset); + w[19] = amd_bytealign (w[ 4], w[ 5], offset); + w[18] = amd_bytealign (w[ 3], w[ 4], offset); + w[17] = amd_bytealign (w[ 2], w[ 3], offset); + w[16] = amd_bytealign (w[ 1], w[ 2], offset); + w[15] = amd_bytealign (w[ 0], w[ 1], offset); + w[14] = amd_bytealign ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = amd_bytealign (w[47], w[48], offset); + w[62] = amd_bytealign (w[46], w[47], offset); + w[61] = amd_bytealign (w[45], w[46], offset); + w[60] = amd_bytealign (w[44], w[45], offset); + w[59] = amd_bytealign (w[43], w[44], offset); + w[58] = amd_bytealign (w[42], w[43], offset); + w[57] = amd_bytealign (w[41], w[42], offset); + w[56] = amd_bytealign (w[40], w[41], offset); + w[55] = amd_bytealign (w[39], w[40], offset); + w[54] = amd_bytealign (w[38], w[39], offset); + w[53] = amd_bytealign (w[37], w[38], offset); + w[52] = amd_bytealign (w[36], w[37], offset); + w[51] = amd_bytealign (w[35], w[36], offset); + w[50] = amd_bytealign (w[34], w[35], offset); + w[49] = amd_bytealign (w[33], w[34], offset); + w[48] = amd_bytealign (w[32], w[33], offset); + w[47] = amd_bytealign (w[31], w[32], offset); + w[46] = amd_bytealign (w[30], w[31], offset); + w[45] = amd_bytealign (w[29], w[30], offset); + w[44] = amd_bytealign (w[28], w[29], offset); + w[43] = amd_bytealign (w[27], w[28], 
offset); + w[42] = amd_bytealign (w[26], w[27], offset); + w[41] = amd_bytealign (w[25], w[26], offset); + w[40] = amd_bytealign (w[24], w[25], offset); + w[39] = amd_bytealign (w[23], w[24], offset); + w[38] = amd_bytealign (w[22], w[23], offset); + w[37] = amd_bytealign (w[21], w[22], offset); + w[36] = amd_bytealign (w[20], w[21], offset); + w[35] = amd_bytealign (w[19], w[20], offset); + w[34] = amd_bytealign (w[18], w[19], offset); + w[33] = amd_bytealign (w[17], w[18], offset); + w[32] = amd_bytealign (w[16], w[17], offset); + w[31] = amd_bytealign (w[15], w[16], offset); + w[30] = amd_bytealign (w[14], w[15], offset); + w[29] = amd_bytealign (w[13], w[14], offset); + w[28] = amd_bytealign (w[12], w[13], offset); + w[27] = amd_bytealign (w[11], w[12], offset); + w[26] = amd_bytealign (w[10], w[11], offset); + w[25] = amd_bytealign (w[ 9], w[10], offset); + w[24] = amd_bytealign (w[ 8], w[ 9], offset); + w[23] = amd_bytealign (w[ 7], w[ 8], offset); + w[22] = amd_bytealign (w[ 6], w[ 7], offset); + w[21] = amd_bytealign (w[ 5], w[ 6], offset); + w[20] = amd_bytealign (w[ 4], w[ 5], offset); + w[19] = amd_bytealign (w[ 3], w[ 4], offset); + w[18] = amd_bytealign (w[ 2], w[ 3], offset); + w[17] = amd_bytealign (w[ 1], w[ 2], offset); + w[16] = amd_bytealign (w[ 0], w[ 1], offset); + w[15] = amd_bytealign ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = amd_bytealign (w[46], w[47], offset); + w[62] = amd_bytealign (w[45], w[46], offset); + w[61] = amd_bytealign (w[44], w[45], offset); + w[60] = amd_bytealign (w[43], w[44], offset); + w[59] = amd_bytealign (w[42], w[43], offset); + w[58] = amd_bytealign (w[41], w[42], offset); + w[57] = amd_bytealign (w[40], w[41], offset); + w[56] = amd_bytealign (w[39], w[40], offset); + w[55] = amd_bytealign (w[38], w[39], 
offset); + w[54] = amd_bytealign (w[37], w[38], offset); + w[53] = amd_bytealign (w[36], w[37], offset); + w[52] = amd_bytealign (w[35], w[36], offset); + w[51] = amd_bytealign (w[34], w[35], offset); + w[50] = amd_bytealign (w[33], w[34], offset); + w[49] = amd_bytealign (w[32], w[33], offset); + w[48] = amd_bytealign (w[31], w[32], offset); + w[47] = amd_bytealign (w[30], w[31], offset); + w[46] = amd_bytealign (w[29], w[30], offset); + w[45] = amd_bytealign (w[28], w[29], offset); + w[44] = amd_bytealign (w[27], w[28], offset); + w[43] = amd_bytealign (w[26], w[27], offset); + w[42] = amd_bytealign (w[25], w[26], offset); + w[41] = amd_bytealign (w[24], w[25], offset); + w[40] = amd_bytealign (w[23], w[24], offset); + w[39] = amd_bytealign (w[22], w[23], offset); + w[38] = amd_bytealign (w[21], w[22], offset); + w[37] = amd_bytealign (w[20], w[21], offset); + w[36] = amd_bytealign (w[19], w[20], offset); + w[35] = amd_bytealign (w[18], w[19], offset); + w[34] = amd_bytealign (w[17], w[18], offset); + w[33] = amd_bytealign (w[16], w[17], offset); + w[32] = amd_bytealign (w[15], w[16], offset); + w[31] = amd_bytealign (w[14], w[15], offset); + w[30] = amd_bytealign (w[13], w[14], offset); + w[29] = amd_bytealign (w[12], w[13], offset); + w[28] = amd_bytealign (w[11], w[12], offset); + w[27] = amd_bytealign (w[10], w[11], offset); + w[26] = amd_bytealign (w[ 9], w[10], offset); + w[25] = amd_bytealign (w[ 8], w[ 9], offset); + w[24] = amd_bytealign (w[ 7], w[ 8], offset); + w[23] = amd_bytealign (w[ 6], w[ 7], offset); + w[22] = amd_bytealign (w[ 5], w[ 6], offset); + w[21] = amd_bytealign (w[ 4], w[ 5], offset); + w[20] = amd_bytealign (w[ 3], w[ 4], offset); + w[19] = amd_bytealign (w[ 2], w[ 3], offset); + w[18] = amd_bytealign (w[ 1], w[ 2], offset); + w[17] = amd_bytealign (w[ 0], w[ 1], offset); + w[16] = amd_bytealign ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 
6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = amd_bytealign (w[45], w[46], offset); + w[62] = amd_bytealign (w[44], w[45], offset); + w[61] = amd_bytealign (w[43], w[44], offset); + w[60] = amd_bytealign (w[42], w[43], offset); + w[59] = amd_bytealign (w[41], w[42], offset); + w[58] = amd_bytealign (w[40], w[41], offset); + w[57] = amd_bytealign (w[39], w[40], offset); + w[56] = amd_bytealign (w[38], w[39], offset); + w[55] = amd_bytealign (w[37], w[38], offset); + w[54] = amd_bytealign (w[36], w[37], offset); + w[53] = amd_bytealign (w[35], w[36], offset); + w[52] = amd_bytealign (w[34], w[35], offset); + w[51] = amd_bytealign (w[33], w[34], offset); + w[50] = amd_bytealign (w[32], w[33], offset); + w[49] = amd_bytealign (w[31], w[32], offset); + w[48] = amd_bytealign (w[30], w[31], offset); + w[47] = amd_bytealign (w[29], w[30], offset); + w[46] = amd_bytealign (w[28], w[29], offset); + w[45] = amd_bytealign (w[27], w[28], offset); + w[44] = amd_bytealign (w[26], w[27], offset); + w[43] = amd_bytealign (w[25], w[26], offset); + w[42] = amd_bytealign (w[24], w[25], offset); + w[41] = amd_bytealign (w[23], w[24], offset); + w[40] = amd_bytealign (w[22], w[23], offset); + w[39] = amd_bytealign (w[21], w[22], offset); + w[38] = amd_bytealign (w[20], w[21], offset); + w[37] = amd_bytealign (w[19], w[20], offset); + w[36] = amd_bytealign (w[18], w[19], offset); + w[35] = amd_bytealign (w[17], w[18], offset); + w[34] = amd_bytealign (w[16], w[17], offset); + w[33] = amd_bytealign (w[15], w[16], offset); + w[32] = amd_bytealign (w[14], w[15], offset); + w[31] = amd_bytealign (w[13], w[14], offset); + w[30] = amd_bytealign (w[12], w[13], offset); + w[29] = amd_bytealign (w[11], w[12], offset); + w[28] = amd_bytealign (w[10], w[11], offset); + w[27] = amd_bytealign (w[ 9], w[10], offset); + w[26] = amd_bytealign (w[ 8], w[ 9], offset); + w[25] = amd_bytealign (w[ 7], w[ 8], offset); + w[24] = 
amd_bytealign (w[ 6], w[ 7], offset); + w[23] = amd_bytealign (w[ 5], w[ 6], offset); + w[22] = amd_bytealign (w[ 4], w[ 5], offset); + w[21] = amd_bytealign (w[ 3], w[ 4], offset); + w[20] = amd_bytealign (w[ 2], w[ 3], offset); + w[19] = amd_bytealign (w[ 1], w[ 2], offset); + w[18] = amd_bytealign (w[ 0], w[ 1], offset); + w[17] = amd_bytealign ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = amd_bytealign (w[44], w[45], offset); + w[62] = amd_bytealign (w[43], w[44], offset); + w[61] = amd_bytealign (w[42], w[43], offset); + w[60] = amd_bytealign (w[41], w[42], offset); + w[59] = amd_bytealign (w[40], w[41], offset); + w[58] = amd_bytealign (w[39], w[40], offset); + w[57] = amd_bytealign (w[38], w[39], offset); + w[56] = amd_bytealign (w[37], w[38], offset); + w[55] = amd_bytealign (w[36], w[37], offset); + w[54] = amd_bytealign (w[35], w[36], offset); + w[53] = amd_bytealign (w[34], w[35], offset); + w[52] = amd_bytealign (w[33], w[34], offset); + w[51] = amd_bytealign (w[32], w[33], offset); + w[50] = amd_bytealign (w[31], w[32], offset); + w[49] = amd_bytealign (w[30], w[31], offset); + w[48] = amd_bytealign (w[29], w[30], offset); + w[47] = amd_bytealign (w[28], w[29], offset); + w[46] = amd_bytealign (w[27], w[28], offset); + w[45] = amd_bytealign (w[26], w[27], offset); + w[44] = amd_bytealign (w[25], w[26], offset); + w[43] = amd_bytealign (w[24], w[25], offset); + w[42] = amd_bytealign (w[23], w[24], offset); + w[41] = amd_bytealign (w[22], w[23], offset); + w[40] = amd_bytealign (w[21], w[22], offset); + w[39] = amd_bytealign (w[20], w[21], offset); + w[38] = amd_bytealign (w[19], w[20], offset); + w[37] = amd_bytealign (w[18], w[19], offset); + w[36] = amd_bytealign (w[17], w[18], offset); + w[35] = amd_bytealign (w[16], w[17], 
offset); + w[34] = amd_bytealign (w[15], w[16], offset); + w[33] = amd_bytealign (w[14], w[15], offset); + w[32] = amd_bytealign (w[13], w[14], offset); + w[31] = amd_bytealign (w[12], w[13], offset); + w[30] = amd_bytealign (w[11], w[12], offset); + w[29] = amd_bytealign (w[10], w[11], offset); + w[28] = amd_bytealign (w[ 9], w[10], offset); + w[27] = amd_bytealign (w[ 8], w[ 9], offset); + w[26] = amd_bytealign (w[ 7], w[ 8], offset); + w[25] = amd_bytealign (w[ 6], w[ 7], offset); + w[24] = amd_bytealign (w[ 5], w[ 6], offset); + w[23] = amd_bytealign (w[ 4], w[ 5], offset); + w[22] = amd_bytealign (w[ 3], w[ 4], offset); + w[21] = amd_bytealign (w[ 2], w[ 3], offset); + w[20] = amd_bytealign (w[ 1], w[ 2], offset); + w[19] = amd_bytealign (w[ 0], w[ 1], offset); + w[18] = amd_bytealign ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = amd_bytealign (w[43], w[44], offset); + w[62] = amd_bytealign (w[42], w[43], offset); + w[61] = amd_bytealign (w[41], w[42], offset); + w[60] = amd_bytealign (w[40], w[41], offset); + w[59] = amd_bytealign (w[39], w[40], offset); + w[58] = amd_bytealign (w[38], w[39], offset); + w[57] = amd_bytealign (w[37], w[38], offset); + w[56] = amd_bytealign (w[36], w[37], offset); + w[55] = amd_bytealign (w[35], w[36], offset); + w[54] = amd_bytealign (w[34], w[35], offset); + w[53] = amd_bytealign (w[33], w[34], offset); + w[52] = amd_bytealign (w[32], w[33], offset); + w[51] = amd_bytealign (w[31], w[32], offset); + w[50] = amd_bytealign (w[30], w[31], offset); + w[49] = amd_bytealign (w[29], w[30], offset); + w[48] = amd_bytealign (w[28], w[29], offset); + w[47] = amd_bytealign (w[27], w[28], offset); + w[46] = amd_bytealign (w[26], w[27], offset); + w[45] = amd_bytealign (w[25], w[26], offset); + w[44] 
= amd_bytealign (w[24], w[25], offset); + w[43] = amd_bytealign (w[23], w[24], offset); + w[42] = amd_bytealign (w[22], w[23], offset); + w[41] = amd_bytealign (w[21], w[22], offset); + w[40] = amd_bytealign (w[20], w[21], offset); + w[39] = amd_bytealign (w[19], w[20], offset); + w[38] = amd_bytealign (w[18], w[19], offset); + w[37] = amd_bytealign (w[17], w[18], offset); + w[36] = amd_bytealign (w[16], w[17], offset); + w[35] = amd_bytealign (w[15], w[16], offset); + w[34] = amd_bytealign (w[14], w[15], offset); + w[33] = amd_bytealign (w[13], w[14], offset); + w[32] = amd_bytealign (w[12], w[13], offset); + w[31] = amd_bytealign (w[11], w[12], offset); + w[30] = amd_bytealign (w[10], w[11], offset); + w[29] = amd_bytealign (w[ 9], w[10], offset); + w[28] = amd_bytealign (w[ 8], w[ 9], offset); + w[27] = amd_bytealign (w[ 7], w[ 8], offset); + w[26] = amd_bytealign (w[ 6], w[ 7], offset); + w[25] = amd_bytealign (w[ 5], w[ 6], offset); + w[24] = amd_bytealign (w[ 4], w[ 5], offset); + w[23] = amd_bytealign (w[ 3], w[ 4], offset); + w[22] = amd_bytealign (w[ 2], w[ 3], offset); + w[21] = amd_bytealign (w[ 1], w[ 2], offset); + w[20] = amd_bytealign (w[ 0], w[ 1], offset); + w[19] = amd_bytealign ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = amd_bytealign (w[42], w[43], offset); + w[62] = amd_bytealign (w[41], w[42], offset); + w[61] = amd_bytealign (w[40], w[41], offset); + w[60] = amd_bytealign (w[39], w[40], offset); + w[59] = amd_bytealign (w[38], w[39], offset); + w[58] = amd_bytealign (w[37], w[38], offset); + w[57] = amd_bytealign (w[36], w[37], offset); + w[56] = amd_bytealign (w[35], w[36], offset); + w[55] = amd_bytealign (w[34], w[35], offset); + w[54] = amd_bytealign (w[33], w[34], offset); + w[53] = 
amd_bytealign (w[32], w[33], offset); + w[52] = amd_bytealign (w[31], w[32], offset); + w[51] = amd_bytealign (w[30], w[31], offset); + w[50] = amd_bytealign (w[29], w[30], offset); + w[49] = amd_bytealign (w[28], w[29], offset); + w[48] = amd_bytealign (w[27], w[28], offset); + w[47] = amd_bytealign (w[26], w[27], offset); + w[46] = amd_bytealign (w[25], w[26], offset); + w[45] = amd_bytealign (w[24], w[25], offset); + w[44] = amd_bytealign (w[23], w[24], offset); + w[43] = amd_bytealign (w[22], w[23], offset); + w[42] = amd_bytealign (w[21], w[22], offset); + w[41] = amd_bytealign (w[20], w[21], offset); + w[40] = amd_bytealign (w[19], w[20], offset); + w[39] = amd_bytealign (w[18], w[19], offset); + w[38] = amd_bytealign (w[17], w[18], offset); + w[37] = amd_bytealign (w[16], w[17], offset); + w[36] = amd_bytealign (w[15], w[16], offset); + w[35] = amd_bytealign (w[14], w[15], offset); + w[34] = amd_bytealign (w[13], w[14], offset); + w[33] = amd_bytealign (w[12], w[13], offset); + w[32] = amd_bytealign (w[11], w[12], offset); + w[31] = amd_bytealign (w[10], w[11], offset); + w[30] = amd_bytealign (w[ 9], w[10], offset); + w[29] = amd_bytealign (w[ 8], w[ 9], offset); + w[28] = amd_bytealign (w[ 7], w[ 8], offset); + w[27] = amd_bytealign (w[ 6], w[ 7], offset); + w[26] = amd_bytealign (w[ 5], w[ 6], offset); + w[25] = amd_bytealign (w[ 4], w[ 5], offset); + w[24] = amd_bytealign (w[ 3], w[ 4], offset); + w[23] = amd_bytealign (w[ 2], w[ 3], offset); + w[22] = amd_bytealign (w[ 1], w[ 2], offset); + w[21] = amd_bytealign (w[ 0], w[ 1], offset); + w[20] = amd_bytealign ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = amd_bytealign (w[41], w[42], offset); + w[62] = amd_bytealign (w[40], w[41], offset); 
+ w[61] = amd_bytealign (w[39], w[40], offset); + w[60] = amd_bytealign (w[38], w[39], offset); + w[59] = amd_bytealign (w[37], w[38], offset); + w[58] = amd_bytealign (w[36], w[37], offset); + w[57] = amd_bytealign (w[35], w[36], offset); + w[56] = amd_bytealign (w[34], w[35], offset); + w[55] = amd_bytealign (w[33], w[34], offset); + w[54] = amd_bytealign (w[32], w[33], offset); + w[53] = amd_bytealign (w[31], w[32], offset); + w[52] = amd_bytealign (w[30], w[31], offset); + w[51] = amd_bytealign (w[29], w[30], offset); + w[50] = amd_bytealign (w[28], w[29], offset); + w[49] = amd_bytealign (w[27], w[28], offset); + w[48] = amd_bytealign (w[26], w[27], offset); + w[47] = amd_bytealign (w[25], w[26], offset); + w[46] = amd_bytealign (w[24], w[25], offset); + w[45] = amd_bytealign (w[23], w[24], offset); + w[44] = amd_bytealign (w[22], w[23], offset); + w[43] = amd_bytealign (w[21], w[22], offset); + w[42] = amd_bytealign (w[20], w[21], offset); + w[41] = amd_bytealign (w[19], w[20], offset); + w[40] = amd_bytealign (w[18], w[19], offset); + w[39] = amd_bytealign (w[17], w[18], offset); + w[38] = amd_bytealign (w[16], w[17], offset); + w[37] = amd_bytealign (w[15], w[16], offset); + w[36] = amd_bytealign (w[14], w[15], offset); + w[35] = amd_bytealign (w[13], w[14], offset); + w[34] = amd_bytealign (w[12], w[13], offset); + w[33] = amd_bytealign (w[11], w[12], offset); + w[32] = amd_bytealign (w[10], w[11], offset); + w[31] = amd_bytealign (w[ 9], w[10], offset); + w[30] = amd_bytealign (w[ 8], w[ 9], offset); + w[29] = amd_bytealign (w[ 7], w[ 8], offset); + w[28] = amd_bytealign (w[ 6], w[ 7], offset); + w[27] = amd_bytealign (w[ 5], w[ 6], offset); + w[26] = amd_bytealign (w[ 4], w[ 5], offset); + w[25] = amd_bytealign (w[ 3], w[ 4], offset); + w[24] = amd_bytealign (w[ 2], w[ 3], offset); + w[23] = amd_bytealign (w[ 1], w[ 2], offset); + w[22] = amd_bytealign (w[ 0], w[ 1], offset); + w[21] = amd_bytealign ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] 
= 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = amd_bytealign (w[40], w[41], offset); + w[62] = amd_bytealign (w[39], w[40], offset); + w[61] = amd_bytealign (w[38], w[39], offset); + w[60] = amd_bytealign (w[37], w[38], offset); + w[59] = amd_bytealign (w[36], w[37], offset); + w[58] = amd_bytealign (w[35], w[36], offset); + w[57] = amd_bytealign (w[34], w[35], offset); + w[56] = amd_bytealign (w[33], w[34], offset); + w[55] = amd_bytealign (w[32], w[33], offset); + w[54] = amd_bytealign (w[31], w[32], offset); + w[53] = amd_bytealign (w[30], w[31], offset); + w[52] = amd_bytealign (w[29], w[30], offset); + w[51] = amd_bytealign (w[28], w[29], offset); + w[50] = amd_bytealign (w[27], w[28], offset); + w[49] = amd_bytealign (w[26], w[27], offset); + w[48] = amd_bytealign (w[25], w[26], offset); + w[47] = amd_bytealign (w[24], w[25], offset); + w[46] = amd_bytealign (w[23], w[24], offset); + w[45] = amd_bytealign (w[22], w[23], offset); + w[44] = amd_bytealign (w[21], w[22], offset); + w[43] = amd_bytealign (w[20], w[21], offset); + w[42] = amd_bytealign (w[19], w[20], offset); + w[41] = amd_bytealign (w[18], w[19], offset); + w[40] = amd_bytealign (w[17], w[18], offset); + w[39] = amd_bytealign (w[16], w[17], offset); + w[38] = amd_bytealign (w[15], w[16], offset); + w[37] = amd_bytealign (w[14], w[15], offset); + w[36] = amd_bytealign (w[13], w[14], offset); + w[35] = amd_bytealign (w[12], w[13], offset); + w[34] = amd_bytealign (w[11], w[12], offset); + w[33] = amd_bytealign (w[10], w[11], offset); + w[32] = amd_bytealign (w[ 9], w[10], offset); + w[31] = amd_bytealign (w[ 8], w[ 9], offset); + w[30] = amd_bytealign (w[ 7], w[ 8], offset); + w[29] = amd_bytealign (w[ 6], w[ 7], offset); + w[28] = amd_bytealign (w[ 5], w[ 6], offset); + w[27] 
= amd_bytealign (w[ 4], w[ 5], offset); + w[26] = amd_bytealign (w[ 3], w[ 4], offset); + w[25] = amd_bytealign (w[ 2], w[ 3], offset); + w[24] = amd_bytealign (w[ 1], w[ 2], offset); + w[23] = amd_bytealign (w[ 0], w[ 1], offset); + w[22] = amd_bytealign ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = amd_bytealign (w[39], w[40], offset); + w[62] = amd_bytealign (w[38], w[39], offset); + w[61] = amd_bytealign (w[37], w[38], offset); + w[60] = amd_bytealign (w[36], w[37], offset); + w[59] = amd_bytealign (w[35], w[36], offset); + w[58] = amd_bytealign (w[34], w[35], offset); + w[57] = amd_bytealign (w[33], w[34], offset); + w[56] = amd_bytealign (w[32], w[33], offset); + w[55] = amd_bytealign (w[31], w[32], offset); + w[54] = amd_bytealign (w[30], w[31], offset); + w[53] = amd_bytealign (w[29], w[30], offset); + w[52] = amd_bytealign (w[28], w[29], offset); + w[51] = amd_bytealign (w[27], w[28], offset); + w[50] = amd_bytealign (w[26], w[27], offset); + w[49] = amd_bytealign (w[25], w[26], offset); + w[48] = amd_bytealign (w[24], w[25], offset); + w[47] = amd_bytealign (w[23], w[24], offset); + w[46] = amd_bytealign (w[22], w[23], offset); + w[45] = amd_bytealign (w[21], w[22], offset); + w[44] = amd_bytealign (w[20], w[21], offset); + w[43] = amd_bytealign (w[19], w[20], offset); + w[42] = amd_bytealign (w[18], w[19], offset); + w[41] = amd_bytealign (w[17], w[18], offset); + w[40] = amd_bytealign (w[16], w[17], offset); + w[39] = amd_bytealign (w[15], w[16], offset); + w[38] = amd_bytealign (w[14], w[15], offset); + w[37] = amd_bytealign (w[13], w[14], offset); + w[36] = amd_bytealign (w[12], w[13], offset); + w[35] = amd_bytealign (w[11], w[12], offset); + w[34] = 
amd_bytealign (w[10], w[11], offset); + w[33] = amd_bytealign (w[ 9], w[10], offset); + w[32] = amd_bytealign (w[ 8], w[ 9], offset); + w[31] = amd_bytealign (w[ 7], w[ 8], offset); + w[30] = amd_bytealign (w[ 6], w[ 7], offset); + w[29] = amd_bytealign (w[ 5], w[ 6], offset); + w[28] = amd_bytealign (w[ 4], w[ 5], offset); + w[27] = amd_bytealign (w[ 3], w[ 4], offset); + w[26] = amd_bytealign (w[ 2], w[ 3], offset); + w[25] = amd_bytealign (w[ 1], w[ 2], offset); + w[24] = amd_bytealign (w[ 0], w[ 1], offset); + w[23] = amd_bytealign ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = amd_bytealign (w[38], w[39], offset); + w[62] = amd_bytealign (w[37], w[38], offset); + w[61] = amd_bytealign (w[36], w[37], offset); + w[60] = amd_bytealign (w[35], w[36], offset); + w[59] = amd_bytealign (w[34], w[35], offset); + w[58] = amd_bytealign (w[33], w[34], offset); + w[57] = amd_bytealign (w[32], w[33], offset); + w[56] = amd_bytealign (w[31], w[32], offset); + w[55] = amd_bytealign (w[30], w[31], offset); + w[54] = amd_bytealign (w[29], w[30], offset); + w[53] = amd_bytealign (w[28], w[29], offset); + w[52] = amd_bytealign (w[27], w[28], offset); + w[51] = amd_bytealign (w[26], w[27], offset); + w[50] = amd_bytealign (w[25], w[26], offset); + w[49] = amd_bytealign (w[24], w[25], offset); + w[48] = amd_bytealign (w[23], w[24], offset); + w[47] = amd_bytealign (w[22], w[23], offset); + w[46] = amd_bytealign (w[21], w[22], offset); + w[45] = amd_bytealign (w[20], w[21], offset); + w[44] = amd_bytealign (w[19], w[20], offset); + w[43] = amd_bytealign (w[18], w[19], offset); + w[42] = amd_bytealign (w[17], w[18], offset); + w[41] = amd_bytealign (w[16], w[17], offset); + w[40] = 
amd_bytealign (w[15], w[16], offset); + w[39] = amd_bytealign (w[14], w[15], offset); + w[38] = amd_bytealign (w[13], w[14], offset); + w[37] = amd_bytealign (w[12], w[13], offset); + w[36] = amd_bytealign (w[11], w[12], offset); + w[35] = amd_bytealign (w[10], w[11], offset); + w[34] = amd_bytealign (w[ 9], w[10], offset); + w[33] = amd_bytealign (w[ 8], w[ 9], offset); + w[32] = amd_bytealign (w[ 7], w[ 8], offset); + w[31] = amd_bytealign (w[ 6], w[ 7], offset); + w[30] = amd_bytealign (w[ 5], w[ 6], offset); + w[29] = amd_bytealign (w[ 4], w[ 5], offset); + w[28] = amd_bytealign (w[ 3], w[ 4], offset); + w[27] = amd_bytealign (w[ 2], w[ 3], offset); + w[26] = amd_bytealign (w[ 1], w[ 2], offset); + w[25] = amd_bytealign (w[ 0], w[ 1], offset); + w[24] = amd_bytealign ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = amd_bytealign (w[37], w[38], offset); + w[62] = amd_bytealign (w[36], w[37], offset); + w[61] = amd_bytealign (w[35], w[36], offset); + w[60] = amd_bytealign (w[34], w[35], offset); + w[59] = amd_bytealign (w[33], w[34], offset); + w[58] = amd_bytealign (w[32], w[33], offset); + w[57] = amd_bytealign (w[31], w[32], offset); + w[56] = amd_bytealign (w[30], w[31], offset); + w[55] = amd_bytealign (w[29], w[30], offset); + w[54] = amd_bytealign (w[28], w[29], offset); + w[53] = amd_bytealign (w[27], w[28], offset); + w[52] = amd_bytealign (w[26], w[27], offset); + w[51] = amd_bytealign (w[25], w[26], offset); + w[50] = amd_bytealign (w[24], w[25], offset); + w[49] = amd_bytealign (w[23], w[24], offset); + w[48] = amd_bytealign (w[22], w[23], offset); + w[47] = amd_bytealign (w[21], w[22], offset); + w[46] = amd_bytealign (w[20], w[21], 
offset); + w[45] = amd_bytealign (w[19], w[20], offset); + w[44] = amd_bytealign (w[18], w[19], offset); + w[43] = amd_bytealign (w[17], w[18], offset); + w[42] = amd_bytealign (w[16], w[17], offset); + w[41] = amd_bytealign (w[15], w[16], offset); + w[40] = amd_bytealign (w[14], w[15], offset); + w[39] = amd_bytealign (w[13], w[14], offset); + w[38] = amd_bytealign (w[12], w[13], offset); + w[37] = amd_bytealign (w[11], w[12], offset); + w[36] = amd_bytealign (w[10], w[11], offset); + w[35] = amd_bytealign (w[ 9], w[10], offset); + w[34] = amd_bytealign (w[ 8], w[ 9], offset); + w[33] = amd_bytealign (w[ 7], w[ 8], offset); + w[32] = amd_bytealign (w[ 6], w[ 7], offset); + w[31] = amd_bytealign (w[ 5], w[ 6], offset); + w[30] = amd_bytealign (w[ 4], w[ 5], offset); + w[29] = amd_bytealign (w[ 3], w[ 4], offset); + w[28] = amd_bytealign (w[ 2], w[ 3], offset); + w[27] = amd_bytealign (w[ 1], w[ 2], offset); + w[26] = amd_bytealign (w[ 0], w[ 1], offset); + w[25] = amd_bytealign ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = amd_bytealign (w[36], w[37], offset); + w[62] = amd_bytealign (w[35], w[36], offset); + w[61] = amd_bytealign (w[34], w[35], offset); + w[60] = amd_bytealign (w[33], w[34], offset); + w[59] = amd_bytealign (w[32], w[33], offset); + w[58] = amd_bytealign (w[31], w[32], offset); + w[57] = amd_bytealign (w[30], w[31], offset); + w[56] = amd_bytealign (w[29], w[30], offset); + w[55] = amd_bytealign (w[28], w[29], offset); + w[54] = amd_bytealign (w[27], w[28], offset); + w[53] = amd_bytealign (w[26], w[27], offset); + w[52] = amd_bytealign (w[25], w[26], offset); + w[51] = amd_bytealign (w[24], w[25], offset); + w[50] = 
amd_bytealign (w[23], w[24], offset); + w[49] = amd_bytealign (w[22], w[23], offset); + w[48] = amd_bytealign (w[21], w[22], offset); + w[47] = amd_bytealign (w[20], w[21], offset); + w[46] = amd_bytealign (w[19], w[20], offset); + w[45] = amd_bytealign (w[18], w[19], offset); + w[44] = amd_bytealign (w[17], w[18], offset); + w[43] = amd_bytealign (w[16], w[17], offset); + w[42] = amd_bytealign (w[15], w[16], offset); + w[41] = amd_bytealign (w[14], w[15], offset); + w[40] = amd_bytealign (w[13], w[14], offset); + w[39] = amd_bytealign (w[12], w[13], offset); + w[38] = amd_bytealign (w[11], w[12], offset); + w[37] = amd_bytealign (w[10], w[11], offset); + w[36] = amd_bytealign (w[ 9], w[10], offset); + w[35] = amd_bytealign (w[ 8], w[ 9], offset); + w[34] = amd_bytealign (w[ 7], w[ 8], offset); + w[33] = amd_bytealign (w[ 6], w[ 7], offset); + w[32] = amd_bytealign (w[ 5], w[ 6], offset); + w[31] = amd_bytealign (w[ 4], w[ 5], offset); + w[30] = amd_bytealign (w[ 3], w[ 4], offset); + w[29] = amd_bytealign (w[ 2], w[ 3], offset); + w[28] = amd_bytealign (w[ 1], w[ 2], offset); + w[27] = amd_bytealign (w[ 0], w[ 1], offset); + w[26] = amd_bytealign ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = amd_bytealign (w[35], w[36], offset); + w[62] = amd_bytealign (w[34], w[35], offset); + w[61] = amd_bytealign (w[33], w[34], offset); + w[60] = amd_bytealign (w[32], w[33], offset); + w[59] = amd_bytealign (w[31], w[32], offset); + w[58] = amd_bytealign (w[30], w[31], offset); + w[57] = amd_bytealign (w[29], w[30], offset); + w[56] = amd_bytealign (w[28], w[29], offset); + w[55] = amd_bytealign (w[27], w[28], offset); + w[54] = 
amd_bytealign (w[26], w[27], offset); + w[53] = amd_bytealign (w[25], w[26], offset); + w[52] = amd_bytealign (w[24], w[25], offset); + w[51] = amd_bytealign (w[23], w[24], offset); + w[50] = amd_bytealign (w[22], w[23], offset); + w[49] = amd_bytealign (w[21], w[22], offset); + w[48] = amd_bytealign (w[20], w[21], offset); + w[47] = amd_bytealign (w[19], w[20], offset); + w[46] = amd_bytealign (w[18], w[19], offset); + w[45] = amd_bytealign (w[17], w[18], offset); + w[44] = amd_bytealign (w[16], w[17], offset); + w[43] = amd_bytealign (w[15], w[16], offset); + w[42] = amd_bytealign (w[14], w[15], offset); + w[41] = amd_bytealign (w[13], w[14], offset); + w[40] = amd_bytealign (w[12], w[13], offset); + w[39] = amd_bytealign (w[11], w[12], offset); + w[38] = amd_bytealign (w[10], w[11], offset); + w[37] = amd_bytealign (w[ 9], w[10], offset); + w[36] = amd_bytealign (w[ 8], w[ 9], offset); + w[35] = amd_bytealign (w[ 7], w[ 8], offset); + w[34] = amd_bytealign (w[ 6], w[ 7], offset); + w[33] = amd_bytealign (w[ 5], w[ 6], offset); + w[32] = amd_bytealign (w[ 4], w[ 5], offset); + w[31] = amd_bytealign (w[ 3], w[ 4], offset); + w[30] = amd_bytealign (w[ 2], w[ 3], offset); + w[29] = amd_bytealign (w[ 1], w[ 2], offset); + w[28] = amd_bytealign (w[ 0], w[ 1], offset); + w[27] = amd_bytealign ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = amd_bytealign (w[34], w[35], offset); + w[62] = amd_bytealign (w[33], w[34], offset); + w[61] = amd_bytealign (w[32], w[33], offset); + w[60] = amd_bytealign (w[31], w[32], offset); + w[59] = amd_bytealign (w[30], w[31], offset); + w[58] = amd_bytealign (w[29], w[30], offset); + 
w[57] = amd_bytealign (w[28], w[29], offset); + w[56] = amd_bytealign (w[27], w[28], offset); + w[55] = amd_bytealign (w[26], w[27], offset); + w[54] = amd_bytealign (w[25], w[26], offset); + w[53] = amd_bytealign (w[24], w[25], offset); + w[52] = amd_bytealign (w[23], w[24], offset); + w[51] = amd_bytealign (w[22], w[23], offset); + w[50] = amd_bytealign (w[21], w[22], offset); + w[49] = amd_bytealign (w[20], w[21], offset); + w[48] = amd_bytealign (w[19], w[20], offset); + w[47] = amd_bytealign (w[18], w[19], offset); + w[46] = amd_bytealign (w[17], w[18], offset); + w[45] = amd_bytealign (w[16], w[17], offset); + w[44] = amd_bytealign (w[15], w[16], offset); + w[43] = amd_bytealign (w[14], w[15], offset); + w[42] = amd_bytealign (w[13], w[14], offset); + w[41] = amd_bytealign (w[12], w[13], offset); + w[40] = amd_bytealign (w[11], w[12], offset); + w[39] = amd_bytealign (w[10], w[11], offset); + w[38] = amd_bytealign (w[ 9], w[10], offset); + w[37] = amd_bytealign (w[ 8], w[ 9], offset); + w[36] = amd_bytealign (w[ 7], w[ 8], offset); + w[35] = amd_bytealign (w[ 6], w[ 7], offset); + w[34] = amd_bytealign (w[ 5], w[ 6], offset); + w[33] = amd_bytealign (w[ 4], w[ 5], offset); + w[32] = amd_bytealign (w[ 3], w[ 4], offset); + w[31] = amd_bytealign (w[ 2], w[ 3], offset); + w[30] = amd_bytealign (w[ 1], w[ 2], offset); + w[29] = amd_bytealign (w[ 0], w[ 1], offset); + w[28] = amd_bytealign ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = amd_bytealign (w[33], w[34], offset); + w[62] = amd_bytealign (w[32], w[33], offset); + w[61] = amd_bytealign (w[31], w[32], offset); + w[60] = amd_bytealign (w[30], 
w[31], offset); + w[59] = amd_bytealign (w[29], w[30], offset); + w[58] = amd_bytealign (w[28], w[29], offset); + w[57] = amd_bytealign (w[27], w[28], offset); + w[56] = amd_bytealign (w[26], w[27], offset); + w[55] = amd_bytealign (w[25], w[26], offset); + w[54] = amd_bytealign (w[24], w[25], offset); + w[53] = amd_bytealign (w[23], w[24], offset); + w[52] = amd_bytealign (w[22], w[23], offset); + w[51] = amd_bytealign (w[21], w[22], offset); + w[50] = amd_bytealign (w[20], w[21], offset); + w[49] = amd_bytealign (w[19], w[20], offset); + w[48] = amd_bytealign (w[18], w[19], offset); + w[47] = amd_bytealign (w[17], w[18], offset); + w[46] = amd_bytealign (w[16], w[17], offset); + w[45] = amd_bytealign (w[15], w[16], offset); + w[44] = amd_bytealign (w[14], w[15], offset); + w[43] = amd_bytealign (w[13], w[14], offset); + w[42] = amd_bytealign (w[12], w[13], offset); + w[41] = amd_bytealign (w[11], w[12], offset); + w[40] = amd_bytealign (w[10], w[11], offset); + w[39] = amd_bytealign (w[ 9], w[10], offset); + w[38] = amd_bytealign (w[ 8], w[ 9], offset); + w[37] = amd_bytealign (w[ 7], w[ 8], offset); + w[36] = amd_bytealign (w[ 6], w[ 7], offset); + w[35] = amd_bytealign (w[ 5], w[ 6], offset); + w[34] = amd_bytealign (w[ 4], w[ 5], offset); + w[33] = amd_bytealign (w[ 3], w[ 4], offset); + w[32] = amd_bytealign (w[ 2], w[ 3], offset); + w[31] = amd_bytealign (w[ 1], w[ 2], offset); + w[30] = amd_bytealign (w[ 0], w[ 1], offset); + w[29] = amd_bytealign ( 0, w[ 0], offset); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = amd_bytealign (w[32], w[33], offset); + w[62] = amd_bytealign (w[31], w[32], offset); + 
w[61] = amd_bytealign (w[30], w[31], offset); + w[60] = amd_bytealign (w[29], w[30], offset); + w[59] = amd_bytealign (w[28], w[29], offset); + w[58] = amd_bytealign (w[27], w[28], offset); + w[57] = amd_bytealign (w[26], w[27], offset); + w[56] = amd_bytealign (w[25], w[26], offset); + w[55] = amd_bytealign (w[24], w[25], offset); + w[54] = amd_bytealign (w[23], w[24], offset); + w[53] = amd_bytealign (w[22], w[23], offset); + w[52] = amd_bytealign (w[21], w[22], offset); + w[51] = amd_bytealign (w[20], w[21], offset); + w[50] = amd_bytealign (w[19], w[20], offset); + w[49] = amd_bytealign (w[18], w[19], offset); + w[48] = amd_bytealign (w[17], w[18], offset); + w[47] = amd_bytealign (w[16], w[17], offset); + w[46] = amd_bytealign (w[15], w[16], offset); + w[45] = amd_bytealign (w[14], w[15], offset); + w[44] = amd_bytealign (w[13], w[14], offset); + w[43] = amd_bytealign (w[12], w[13], offset); + w[42] = amd_bytealign (w[11], w[12], offset); + w[41] = amd_bytealign (w[10], w[11], offset); + w[40] = amd_bytealign (w[ 9], w[10], offset); + w[39] = amd_bytealign (w[ 8], w[ 9], offset); + w[38] = amd_bytealign (w[ 7], w[ 8], offset); + w[37] = amd_bytealign (w[ 6], w[ 7], offset); + w[36] = amd_bytealign (w[ 5], w[ 6], offset); + w[35] = amd_bytealign (w[ 4], w[ 5], offset); + w[34] = amd_bytealign (w[ 3], w[ 4], offset); + w[33] = amd_bytealign (w[ 2], w[ 3], offset); + w[32] = amd_bytealign (w[ 1], w[ 2], offset); + w[31] = amd_bytealign (w[ 0], w[ 1], offset); + w[30] = amd_bytealign ( 0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = amd_bytealign (w[31], w[32], offset); + 
w[62] = amd_bytealign (w[30], w[31], offset); + w[61] = amd_bytealign (w[29], w[30], offset); + w[60] = amd_bytealign (w[28], w[29], offset); + w[59] = amd_bytealign (w[27], w[28], offset); + w[58] = amd_bytealign (w[26], w[27], offset); + w[57] = amd_bytealign (w[25], w[26], offset); + w[56] = amd_bytealign (w[24], w[25], offset); + w[55] = amd_bytealign (w[23], w[24], offset); + w[54] = amd_bytealign (w[22], w[23], offset); + w[53] = amd_bytealign (w[21], w[22], offset); + w[52] = amd_bytealign (w[20], w[21], offset); + w[51] = amd_bytealign (w[19], w[20], offset); + w[50] = amd_bytealign (w[18], w[19], offset); + w[49] = amd_bytealign (w[17], w[18], offset); + w[48] = amd_bytealign (w[16], w[17], offset); + w[47] = amd_bytealign (w[15], w[16], offset); + w[46] = amd_bytealign (w[14], w[15], offset); + w[45] = amd_bytealign (w[13], w[14], offset); + w[44] = amd_bytealign (w[12], w[13], offset); + w[43] = amd_bytealign (w[11], w[12], offset); + w[42] = amd_bytealign (w[10], w[11], offset); + w[41] = amd_bytealign (w[ 9], w[10], offset); + w[40] = amd_bytealign (w[ 8], w[ 9], offset); + w[39] = amd_bytealign (w[ 7], w[ 8], offset); + w[38] = amd_bytealign (w[ 6], w[ 7], offset); + w[37] = amd_bytealign (w[ 5], w[ 6], offset); + w[36] = amd_bytealign (w[ 4], w[ 5], offset); + w[35] = amd_bytealign (w[ 3], w[ 4], offset); + w[34] = amd_bytealign (w[ 2], w[ 3], offset); + w[33] = amd_bytealign (w[ 1], w[ 2], offset); + w[32] = amd_bytealign (w[ 0], w[ 1], offset); + w[31] = amd_bytealign ( 0, w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = amd_bytealign (w[30], w[31], 
offset); + w[62] = amd_bytealign (w[29], w[30], offset); + w[61] = amd_bytealign (w[28], w[29], offset); + w[60] = amd_bytealign (w[27], w[28], offset); + w[59] = amd_bytealign (w[26], w[27], offset); + w[58] = amd_bytealign (w[25], w[26], offset); + w[57] = amd_bytealign (w[24], w[25], offset); + w[56] = amd_bytealign (w[23], w[24], offset); + w[55] = amd_bytealign (w[22], w[23], offset); + w[54] = amd_bytealign (w[21], w[22], offset); + w[53] = amd_bytealign (w[20], w[21], offset); + w[52] = amd_bytealign (w[19], w[20], offset); + w[51] = amd_bytealign (w[18], w[19], offset); + w[50] = amd_bytealign (w[17], w[18], offset); + w[49] = amd_bytealign (w[16], w[17], offset); + w[48] = amd_bytealign (w[15], w[16], offset); + w[47] = amd_bytealign (w[14], w[15], offset); + w[46] = amd_bytealign (w[13], w[14], offset); + w[45] = amd_bytealign (w[12], w[13], offset); + w[44] = amd_bytealign (w[11], w[12], offset); + w[43] = amd_bytealign (w[10], w[11], offset); + w[42] = amd_bytealign (w[ 9], w[10], offset); + w[41] = amd_bytealign (w[ 8], w[ 9], offset); + w[40] = amd_bytealign (w[ 7], w[ 8], offset); + w[39] = amd_bytealign (w[ 6], w[ 7], offset); + w[38] = amd_bytealign (w[ 5], w[ 6], offset); + w[37] = amd_bytealign (w[ 4], w[ 5], offset); + w[36] = amd_bytealign (w[ 3], w[ 4], offset); + w[35] = amd_bytealign (w[ 2], w[ 3], offset); + w[34] = amd_bytealign (w[ 1], w[ 2], offset); + w[33] = amd_bytealign (w[ 0], w[ 1], offset); + w[32] = amd_bytealign ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = amd_bytealign (w[29], w[30], offset); + w[62] = 
amd_bytealign (w[28], w[29], offset); + w[61] = amd_bytealign (w[27], w[28], offset); + w[60] = amd_bytealign (w[26], w[27], offset); + w[59] = amd_bytealign (w[25], w[26], offset); + w[58] = amd_bytealign (w[24], w[25], offset); + w[57] = amd_bytealign (w[23], w[24], offset); + w[56] = amd_bytealign (w[22], w[23], offset); + w[55] = amd_bytealign (w[21], w[22], offset); + w[54] = amd_bytealign (w[20], w[21], offset); + w[53] = amd_bytealign (w[19], w[20], offset); + w[52] = amd_bytealign (w[18], w[19], offset); + w[51] = amd_bytealign (w[17], w[18], offset); + w[50] = amd_bytealign (w[16], w[17], offset); + w[49] = amd_bytealign (w[15], w[16], offset); + w[48] = amd_bytealign (w[14], w[15], offset); + w[47] = amd_bytealign (w[13], w[14], offset); + w[46] = amd_bytealign (w[12], w[13], offset); + w[45] = amd_bytealign (w[11], w[12], offset); + w[44] = amd_bytealign (w[10], w[11], offset); + w[43] = amd_bytealign (w[ 9], w[10], offset); + w[42] = amd_bytealign (w[ 8], w[ 9], offset); + w[41] = amd_bytealign (w[ 7], w[ 8], offset); + w[40] = amd_bytealign (w[ 6], w[ 7], offset); + w[39] = amd_bytealign (w[ 5], w[ 6], offset); + w[38] = amd_bytealign (w[ 4], w[ 5], offset); + w[37] = amd_bytealign (w[ 3], w[ 4], offset); + w[36] = amd_bytealign (w[ 2], w[ 3], offset); + w[35] = amd_bytealign (w[ 1], w[ 2], offset); + w[34] = amd_bytealign (w[ 0], w[ 1], offset); + w[33] = amd_bytealign ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = amd_bytealign (w[28], w[29], offset); + w[62] = amd_bytealign (w[27], w[28], offset); + w[61] = amd_bytealign 
(w[26], w[27], offset); + w[60] = amd_bytealign (w[25], w[26], offset); + w[59] = amd_bytealign (w[24], w[25], offset); + w[58] = amd_bytealign (w[23], w[24], offset); + w[57] = amd_bytealign (w[22], w[23], offset); + w[56] = amd_bytealign (w[21], w[22], offset); + w[55] = amd_bytealign (w[20], w[21], offset); + w[54] = amd_bytealign (w[19], w[20], offset); + w[53] = amd_bytealign (w[18], w[19], offset); + w[52] = amd_bytealign (w[17], w[18], offset); + w[51] = amd_bytealign (w[16], w[17], offset); + w[50] = amd_bytealign (w[15], w[16], offset); + w[49] = amd_bytealign (w[14], w[15], offset); + w[48] = amd_bytealign (w[13], w[14], offset); + w[47] = amd_bytealign (w[12], w[13], offset); + w[46] = amd_bytealign (w[11], w[12], offset); + w[45] = amd_bytealign (w[10], w[11], offset); + w[44] = amd_bytealign (w[ 9], w[10], offset); + w[43] = amd_bytealign (w[ 8], w[ 9], offset); + w[42] = amd_bytealign (w[ 7], w[ 8], offset); + w[41] = amd_bytealign (w[ 6], w[ 7], offset); + w[40] = amd_bytealign (w[ 5], w[ 6], offset); + w[39] = amd_bytealign (w[ 4], w[ 5], offset); + w[38] = amd_bytealign (w[ 3], w[ 4], offset); + w[37] = amd_bytealign (w[ 2], w[ 3], offset); + w[36] = amd_bytealign (w[ 1], w[ 2], offset); + w[35] = amd_bytealign (w[ 0], w[ 1], offset); + w[34] = amd_bytealign ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = amd_bytealign (w[27], w[28], offset); + w[62] = amd_bytealign (w[26], w[27], offset); + w[61] = amd_bytealign (w[25], w[26], offset); + w[60] = amd_bytealign (w[24], w[25], offset); + w[59] = amd_bytealign 
(w[23], w[24], offset); + w[58] = amd_bytealign (w[22], w[23], offset); + w[57] = amd_bytealign (w[21], w[22], offset); + w[56] = amd_bytealign (w[20], w[21], offset); + w[55] = amd_bytealign (w[19], w[20], offset); + w[54] = amd_bytealign (w[18], w[19], offset); + w[53] = amd_bytealign (w[17], w[18], offset); + w[52] = amd_bytealign (w[16], w[17], offset); + w[51] = amd_bytealign (w[15], w[16], offset); + w[50] = amd_bytealign (w[14], w[15], offset); + w[49] = amd_bytealign (w[13], w[14], offset); + w[48] = amd_bytealign (w[12], w[13], offset); + w[47] = amd_bytealign (w[11], w[12], offset); + w[46] = amd_bytealign (w[10], w[11], offset); + w[45] = amd_bytealign (w[ 9], w[10], offset); + w[44] = amd_bytealign (w[ 8], w[ 9], offset); + w[43] = amd_bytealign (w[ 7], w[ 8], offset); + w[42] = amd_bytealign (w[ 6], w[ 7], offset); + w[41] = amd_bytealign (w[ 5], w[ 6], offset); + w[40] = amd_bytealign (w[ 4], w[ 5], offset); + w[39] = amd_bytealign (w[ 3], w[ 4], offset); + w[38] = amd_bytealign (w[ 2], w[ 3], offset); + w[37] = amd_bytealign (w[ 1], w[ 2], offset); + w[36] = amd_bytealign (w[ 0], w[ 1], offset); + w[35] = amd_bytealign ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = amd_bytealign (w[26], w[27], offset); + w[62] = amd_bytealign (w[25], w[26], offset); + w[61] = amd_bytealign (w[24], w[25], offset); + w[60] = amd_bytealign (w[23], w[24], offset); + w[59] = amd_bytealign (w[22], w[23], offset); + w[58] = amd_bytealign (w[21], w[22], offset); + w[57] = amd_bytealign (w[20], w[21], offset); + w[56] = 
amd_bytealign (w[19], w[20], offset); + w[55] = amd_bytealign (w[18], w[19], offset); + w[54] = amd_bytealign (w[17], w[18], offset); + w[53] = amd_bytealign (w[16], w[17], offset); + w[52] = amd_bytealign (w[15], w[16], offset); + w[51] = amd_bytealign (w[14], w[15], offset); + w[50] = amd_bytealign (w[13], w[14], offset); + w[49] = amd_bytealign (w[12], w[13], offset); + w[48] = amd_bytealign (w[11], w[12], offset); + w[47] = amd_bytealign (w[10], w[11], offset); + w[46] = amd_bytealign (w[ 9], w[10], offset); + w[45] = amd_bytealign (w[ 8], w[ 9], offset); + w[44] = amd_bytealign (w[ 7], w[ 8], offset); + w[43] = amd_bytealign (w[ 6], w[ 7], offset); + w[42] = amd_bytealign (w[ 5], w[ 6], offset); + w[41] = amd_bytealign (w[ 4], w[ 5], offset); + w[40] = amd_bytealign (w[ 3], w[ 4], offset); + w[39] = amd_bytealign (w[ 2], w[ 3], offset); + w[38] = amd_bytealign (w[ 1], w[ 2], offset); + w[37] = amd_bytealign (w[ 0], w[ 1], offset); + w[36] = amd_bytealign ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = amd_bytealign (w[25], w[26], offset); + w[62] = amd_bytealign (w[24], w[25], offset); + w[61] = amd_bytealign (w[23], w[24], offset); + w[60] = amd_bytealign (w[22], w[23], offset); + w[59] = amd_bytealign (w[21], w[22], offset); + w[58] = amd_bytealign (w[20], w[21], offset); + w[57] = amd_bytealign (w[19], w[20], offset); + w[56] = amd_bytealign (w[18], w[19], offset); + w[55] = amd_bytealign (w[17], w[18], offset); + w[54] = amd_bytealign (w[16], w[17], offset); + w[53] = amd_bytealign (w[15], 
w[16], offset); + w[52] = amd_bytealign (w[14], w[15], offset); + w[51] = amd_bytealign (w[13], w[14], offset); + w[50] = amd_bytealign (w[12], w[13], offset); + w[49] = amd_bytealign (w[11], w[12], offset); + w[48] = amd_bytealign (w[10], w[11], offset); + w[47] = amd_bytealign (w[ 9], w[10], offset); + w[46] = amd_bytealign (w[ 8], w[ 9], offset); + w[45] = amd_bytealign (w[ 7], w[ 8], offset); + w[44] = amd_bytealign (w[ 6], w[ 7], offset); + w[43] = amd_bytealign (w[ 5], w[ 6], offset); + w[42] = amd_bytealign (w[ 4], w[ 5], offset); + w[41] = amd_bytealign (w[ 3], w[ 4], offset); + w[40] = amd_bytealign (w[ 2], w[ 3], offset); + w[39] = amd_bytealign (w[ 1], w[ 2], offset); + w[38] = amd_bytealign (w[ 0], w[ 1], offset); + w[37] = amd_bytealign ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = amd_bytealign (w[24], w[25], offset); + w[62] = amd_bytealign (w[23], w[24], offset); + w[61] = amd_bytealign (w[22], w[23], offset); + w[60] = amd_bytealign (w[21], w[22], offset); + w[59] = amd_bytealign (w[20], w[21], offset); + w[58] = amd_bytealign (w[19], w[20], offset); + w[57] = amd_bytealign (w[18], w[19], offset); + w[56] = amd_bytealign (w[17], w[18], offset); + w[55] = amd_bytealign (w[16], w[17], offset); + w[54] = amd_bytealign (w[15], w[16], offset); + w[53] = amd_bytealign (w[14], w[15], offset); + w[52] = amd_bytealign (w[13], w[14], offset); + w[51] = amd_bytealign (w[12], w[13], offset); + w[50] = amd_bytealign (w[11], w[12], offset); + w[49] = amd_bytealign (w[10], w[11], 
offset); + w[48] = amd_bytealign (w[ 9], w[10], offset); + w[47] = amd_bytealign (w[ 8], w[ 9], offset); + w[46] = amd_bytealign (w[ 7], w[ 8], offset); + w[45] = amd_bytealign (w[ 6], w[ 7], offset); + w[44] = amd_bytealign (w[ 5], w[ 6], offset); + w[43] = amd_bytealign (w[ 4], w[ 5], offset); + w[42] = amd_bytealign (w[ 3], w[ 4], offset); + w[41] = amd_bytealign (w[ 2], w[ 3], offset); + w[40] = amd_bytealign (w[ 1], w[ 2], offset); + w[39] = amd_bytealign (w[ 0], w[ 1], offset); + w[38] = amd_bytealign ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = amd_bytealign (w[23], w[24], offset); + w[62] = amd_bytealign (w[22], w[23], offset); + w[61] = amd_bytealign (w[21], w[22], offset); + w[60] = amd_bytealign (w[20], w[21], offset); + w[59] = amd_bytealign (w[19], w[20], offset); + w[58] = amd_bytealign (w[18], w[19], offset); + w[57] = amd_bytealign (w[17], w[18], offset); + w[56] = amd_bytealign (w[16], w[17], offset); + w[55] = amd_bytealign (w[15], w[16], offset); + w[54] = amd_bytealign (w[14], w[15], offset); + w[53] = amd_bytealign (w[13], w[14], offset); + w[52] = amd_bytealign (w[12], w[13], offset); + w[51] = amd_bytealign (w[11], w[12], offset); + w[50] = amd_bytealign (w[10], w[11], offset); + w[49] = amd_bytealign (w[ 9], w[10], offset); + w[48] = amd_bytealign (w[ 8], w[ 9], offset); + w[47] = amd_bytealign (w[ 7], w[ 8], offset); + w[46] = amd_bytealign (w[ 6], w[ 7], offset); + w[45] = amd_bytealign (w[ 5], w[ 6], offset); + w[44] = amd_bytealign (w[ 4], w[ 
5], offset); + w[43] = amd_bytealign (w[ 3], w[ 4], offset); + w[42] = amd_bytealign (w[ 2], w[ 3], offset); + w[41] = amd_bytealign (w[ 1], w[ 2], offset); + w[40] = amd_bytealign (w[ 0], w[ 1], offset); + w[39] = amd_bytealign ( 0, w[ 0], offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = amd_bytealign (w[22], w[23], offset); + w[62] = amd_bytealign (w[21], w[22], offset); + w[61] = amd_bytealign (w[20], w[21], offset); + w[60] = amd_bytealign (w[19], w[20], offset); + w[59] = amd_bytealign (w[18], w[19], offset); + w[58] = amd_bytealign (w[17], w[18], offset); + w[57] = amd_bytealign (w[16], w[17], offset); + w[56] = amd_bytealign (w[15], w[16], offset); + w[55] = amd_bytealign (w[14], w[15], offset); + w[54] = amd_bytealign (w[13], w[14], offset); + w[53] = amd_bytealign (w[12], w[13], offset); + w[52] = amd_bytealign (w[11], w[12], offset); + w[51] = amd_bytealign (w[10], w[11], offset); + w[50] = amd_bytealign (w[ 9], w[10], offset); + w[49] = amd_bytealign (w[ 8], w[ 9], offset); + w[48] = amd_bytealign (w[ 7], w[ 8], offset); + w[47] = amd_bytealign (w[ 6], w[ 7], offset); + w[46] = amd_bytealign (w[ 5], w[ 6], offset); + w[45] = amd_bytealign (w[ 4], w[ 5], offset); + w[44] = amd_bytealign (w[ 3], w[ 4], offset); + w[43] = amd_bytealign (w[ 2], w[ 3], offset); + w[42] = amd_bytealign (w[ 1], w[ 2], offset); + w[41] = amd_bytealign (w[ 0], w[ 1], offset); + w[40] = amd_bytealign ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + 
w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = amd_bytealign (w[21], w[22], offset); + w[62] = amd_bytealign (w[20], w[21], offset); + w[61] = amd_bytealign (w[19], w[20], offset); + w[60] = amd_bytealign (w[18], w[19], offset); + w[59] = amd_bytealign (w[17], w[18], offset); + w[58] = amd_bytealign (w[16], w[17], offset); + w[57] = amd_bytealign (w[15], w[16], offset); + w[56] = amd_bytealign (w[14], w[15], offset); + w[55] = amd_bytealign (w[13], w[14], offset); + w[54] = amd_bytealign (w[12], w[13], offset); + w[53] = amd_bytealign (w[11], w[12], offset); + w[52] = amd_bytealign (w[10], w[11], offset); + w[51] = amd_bytealign (w[ 9], w[10], offset); + w[50] = amd_bytealign (w[ 8], w[ 9], offset); + w[49] = amd_bytealign (w[ 7], w[ 8], offset); + w[48] = amd_bytealign (w[ 6], w[ 7], offset); + w[47] = amd_bytealign (w[ 5], w[ 6], offset); + w[46] = amd_bytealign (w[ 4], w[ 5], offset); + w[45] = amd_bytealign (w[ 3], w[ 4], offset); + w[44] = amd_bytealign (w[ 2], w[ 3], offset); + w[43] = amd_bytealign (w[ 1], w[ 2], offset); + w[42] = amd_bytealign (w[ 0], w[ 1], offset); + w[41] = amd_bytealign ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 
8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = amd_bytealign (w[20], w[21], offset); + w[62] = amd_bytealign (w[19], w[20], offset); + w[61] = amd_bytealign (w[18], w[19], offset); + w[60] = amd_bytealign (w[17], w[18], offset); + w[59] = amd_bytealign (w[16], w[17], offset); + w[58] = amd_bytealign (w[15], w[16], offset); + w[57] = amd_bytealign (w[14], w[15], offset); + w[56] = amd_bytealign (w[13], w[14], offset); + w[55] = amd_bytealign (w[12], w[13], offset); + w[54] = amd_bytealign (w[11], w[12], offset); + w[53] = amd_bytealign (w[10], w[11], offset); + w[52] = amd_bytealign (w[ 9], w[10], offset); + w[51] = amd_bytealign (w[ 8], w[ 9], offset); + w[50] = amd_bytealign (w[ 7], w[ 8], offset); + w[49] = amd_bytealign (w[ 6], w[ 7], offset); + w[48] = amd_bytealign (w[ 5], w[ 6], offset); + w[47] = amd_bytealign (w[ 4], w[ 5], offset); + w[46] = amd_bytealign (w[ 3], w[ 4], offset); + w[45] = amd_bytealign (w[ 2], w[ 3], offset); + w[44] = amd_bytealign (w[ 1], w[ 2], offset); + w[43] = amd_bytealign (w[ 0], w[ 1], offset); + w[42] = amd_bytealign ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = amd_bytealign (w[19], w[20], offset); + w[62] = amd_bytealign (w[18], w[19], offset); + w[61] = amd_bytealign (w[17], w[18], offset); + w[60] = amd_bytealign (w[16], w[17], offset); + w[59] = amd_bytealign (w[15], w[16], offset); 
+ w[58] = amd_bytealign (w[14], w[15], offset); + w[57] = amd_bytealign (w[13], w[14], offset); + w[56] = amd_bytealign (w[12], w[13], offset); + w[55] = amd_bytealign (w[11], w[12], offset); + w[54] = amd_bytealign (w[10], w[11], offset); + w[53] = amd_bytealign (w[ 9], w[10], offset); + w[52] = amd_bytealign (w[ 8], w[ 9], offset); + w[51] = amd_bytealign (w[ 7], w[ 8], offset); + w[50] = amd_bytealign (w[ 6], w[ 7], offset); + w[49] = amd_bytealign (w[ 5], w[ 6], offset); + w[48] = amd_bytealign (w[ 4], w[ 5], offset); + w[47] = amd_bytealign (w[ 3], w[ 4], offset); + w[46] = amd_bytealign (w[ 2], w[ 3], offset); + w[45] = amd_bytealign (w[ 1], w[ 2], offset); + w[44] = amd_bytealign (w[ 0], w[ 1], offset); + w[43] = amd_bytealign ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = amd_bytealign (w[18], w[19], offset); + w[62] = amd_bytealign (w[17], w[18], offset); + w[61] = amd_bytealign (w[16], w[17], offset); + w[60] = amd_bytealign (w[15], w[16], offset); + w[59] = amd_bytealign (w[14], w[15], offset); + w[58] = amd_bytealign (w[13], w[14], offset); + w[57] = amd_bytealign (w[12], w[13], offset); + w[56] = amd_bytealign (w[11], w[12], offset); + w[55] = amd_bytealign (w[10], w[11], offset); + w[54] = amd_bytealign (w[ 9], w[10], offset); + w[53] = amd_bytealign (w[ 8], w[ 9], offset); + w[52] = amd_bytealign (w[ 7], w[ 8], offset); + w[51] = amd_bytealign (w[ 6], w[ 7], offset); + w[50] = amd_bytealign (w[ 
5], w[ 6], offset); + w[49] = amd_bytealign (w[ 4], w[ 5], offset); + w[48] = amd_bytealign (w[ 3], w[ 4], offset); + w[47] = amd_bytealign (w[ 2], w[ 3], offset); + w[46] = amd_bytealign (w[ 1], w[ 2], offset); + w[45] = amd_bytealign (w[ 0], w[ 1], offset); + w[44] = amd_bytealign ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = amd_bytealign (w[17], w[18], offset); + w[62] = amd_bytealign (w[16], w[17], offset); + w[61] = amd_bytealign (w[15], w[16], offset); + w[60] = amd_bytealign (w[14], w[15], offset); + w[59] = amd_bytealign (w[13], w[14], offset); + w[58] = amd_bytealign (w[12], w[13], offset); + w[57] = amd_bytealign (w[11], w[12], offset); + w[56] = amd_bytealign (w[10], w[11], offset); + w[55] = amd_bytealign (w[ 9], w[10], offset); + w[54] = amd_bytealign (w[ 8], w[ 9], offset); + w[53] = amd_bytealign (w[ 7], w[ 8], offset); + w[52] = amd_bytealign (w[ 6], w[ 7], offset); + w[51] = amd_bytealign (w[ 5], w[ 6], offset); + w[50] = amd_bytealign (w[ 4], w[ 5], offset); + w[49] = amd_bytealign (w[ 3], w[ 4], offset); + w[48] = amd_bytealign (w[ 2], w[ 3], offset); + w[47] = amd_bytealign (w[ 1], w[ 2], offset); + w[46] = amd_bytealign (w[ 0], w[ 1], offset); + w[45] = amd_bytealign ( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] 
= 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = amd_bytealign (w[16], w[17], offset); + w[62] = amd_bytealign (w[15], w[16], offset); + w[61] = amd_bytealign (w[14], w[15], offset); + w[60] = amd_bytealign (w[13], w[14], offset); + w[59] = amd_bytealign (w[12], w[13], offset); + w[58] = amd_bytealign (w[11], w[12], offset); + w[57] = amd_bytealign (w[10], w[11], offset); + w[56] = amd_bytealign (w[ 9], w[10], offset); + w[55] = amd_bytealign (w[ 8], w[ 9], offset); + w[54] = amd_bytealign (w[ 7], w[ 8], offset); + w[53] = amd_bytealign (w[ 6], w[ 7], offset); + w[52] = amd_bytealign (w[ 5], w[ 6], offset); + w[51] = amd_bytealign (w[ 4], w[ 5], offset); + w[50] = amd_bytealign (w[ 3], w[ 4], offset); + w[49] = amd_bytealign (w[ 2], w[ 3], offset); + w[48] = amd_bytealign (w[ 1], w[ 2], offset); + w[47] = amd_bytealign (w[ 0], w[ 1], offset); + w[46] = amd_bytealign ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = amd_bytealign (w[15], w[16], offset); + w[62] = amd_bytealign (w[14], w[15], offset); + 
w[61] = amd_bytealign (w[13], w[14], offset); + w[60] = amd_bytealign (w[12], w[13], offset); + w[59] = amd_bytealign (w[11], w[12], offset); + w[58] = amd_bytealign (w[10], w[11], offset); + w[57] = amd_bytealign (w[ 9], w[10], offset); + w[56] = amd_bytealign (w[ 8], w[ 9], offset); + w[55] = amd_bytealign (w[ 7], w[ 8], offset); + w[54] = amd_bytealign (w[ 6], w[ 7], offset); + w[53] = amd_bytealign (w[ 5], w[ 6], offset); + w[52] = amd_bytealign (w[ 4], w[ 5], offset); + w[51] = amd_bytealign (w[ 3], w[ 4], offset); + w[50] = amd_bytealign (w[ 2], w[ 3], offset); + w[49] = amd_bytealign (w[ 1], w[ 2], offset); + w[48] = amd_bytealign (w[ 0], w[ 1], offset); + w[47] = amd_bytealign ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = amd_bytealign (w[14], w[15], offset); + w[62] = amd_bytealign (w[13], w[14], offset); + w[61] = amd_bytealign (w[12], w[13], offset); + w[60] = amd_bytealign (w[11], w[12], offset); + w[59] = amd_bytealign (w[10], w[11], offset); + w[58] = amd_bytealign (w[ 9], w[10], offset); + w[57] = amd_bytealign (w[ 8], w[ 9], offset); + w[56] = amd_bytealign (w[ 7], w[ 8], offset); + w[55] = amd_bytealign (w[ 6], w[ 7], offset); + w[54] = amd_bytealign (w[ 5], w[ 6], offset); + w[53] = amd_bytealign (w[ 4], w[ 5], offset); + w[52] = amd_bytealign (w[ 3], w[ 4], offset); + w[51] = amd_bytealign (w[ 2], w[ 3], offset); + w[50] = amd_bytealign 
(w[ 1], w[ 2], offset); + w[49] = amd_bytealign (w[ 0], w[ 1], offset); + w[48] = amd_bytealign ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = amd_bytealign (w[13], w[14], offset); + w[62] = amd_bytealign (w[12], w[13], offset); + w[61] = amd_bytealign (w[11], w[12], offset); + w[60] = amd_bytealign (w[10], w[11], offset); + w[59] = amd_bytealign (w[ 9], w[10], offset); + w[58] = amd_bytealign (w[ 8], w[ 9], offset); + w[57] = amd_bytealign (w[ 7], w[ 8], offset); + w[56] = amd_bytealign (w[ 6], w[ 7], offset); + w[55] = amd_bytealign (w[ 5], w[ 6], offset); + w[54] = amd_bytealign (w[ 4], w[ 5], offset); + w[53] = amd_bytealign (w[ 3], w[ 4], offset); + w[52] = amd_bytealign (w[ 2], w[ 3], offset); + w[51] = amd_bytealign (w[ 1], w[ 2], offset); + w[50] = amd_bytealign (w[ 0], w[ 1], offset); + w[49] = amd_bytealign ( 0, w[ 0], offset); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 
0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = amd_bytealign (w[12], w[13], offset); + w[62] = amd_bytealign (w[11], w[12], offset); + w[61] = amd_bytealign (w[10], w[11], offset); + w[60] = amd_bytealign (w[ 9], w[10], offset); + w[59] = amd_bytealign (w[ 8], w[ 9], offset); + w[58] = amd_bytealign (w[ 7], w[ 8], offset); + w[57] = amd_bytealign (w[ 6], w[ 7], offset); + w[56] = amd_bytealign (w[ 5], w[ 6], offset); + w[55] = amd_bytealign (w[ 4], w[ 5], offset); + w[54] = amd_bytealign (w[ 3], w[ 4], offset); + w[53] = amd_bytealign (w[ 2], w[ 3], offset); + w[52] = amd_bytealign (w[ 1], w[ 2], offset); + w[51] = amd_bytealign (w[ 0], w[ 1], offset); + w[50] = amd_bytealign ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = amd_bytealign (w[11], w[12], offset); + w[62] = amd_bytealign (w[10], w[11], offset); + w[61] = amd_bytealign (w[ 9], w[10], offset); + w[60] = amd_bytealign (w[ 8], w[ 9], offset); + w[59] = amd_bytealign (w[ 7], w[ 8], offset); + w[58] = amd_bytealign (w[ 6], w[ 7], offset); + w[57] = amd_bytealign (w[ 5], w[ 6], offset); + w[56] = amd_bytealign (w[ 4], w[ 5], offset); + w[55] = amd_bytealign (w[ 3], w[ 4], offset); + w[54] = amd_bytealign (w[ 2], w[ 3], offset); + w[53] = amd_bytealign (w[ 1], 
w[ 2], offset); + w[52] = amd_bytealign (w[ 0], w[ 1], offset); + w[51] = amd_bytealign ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = amd_bytealign (w[10], w[11], offset); + w[62] = amd_bytealign (w[ 9], w[10], offset); + w[61] = amd_bytealign (w[ 8], w[ 9], offset); + w[60] = amd_bytealign (w[ 7], w[ 8], offset); + w[59] = amd_bytealign (w[ 6], w[ 7], offset); + w[58] = amd_bytealign (w[ 5], w[ 6], offset); + w[57] = amd_bytealign (w[ 4], w[ 5], offset); + w[56] = amd_bytealign (w[ 3], w[ 4], offset); + w[55] = amd_bytealign (w[ 2], w[ 3], offset); + w[54] = amd_bytealign (w[ 1], w[ 2], offset); + w[53] = amd_bytealign (w[ 0], w[ 1], offset); + w[52] = amd_bytealign ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 
3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = amd_bytealign (w[ 9], w[10], offset); + w[62] = amd_bytealign (w[ 8], w[ 9], offset); + w[61] = amd_bytealign (w[ 7], w[ 8], offset); + w[60] = amd_bytealign (w[ 6], w[ 7], offset); + w[59] = amd_bytealign (w[ 5], w[ 6], offset); + w[58] = amd_bytealign (w[ 4], w[ 5], offset); + w[57] = amd_bytealign (w[ 3], w[ 4], offset); + w[56] = amd_bytealign (w[ 2], w[ 3], offset); + w[55] = amd_bytealign (w[ 1], w[ 2], offset); + w[54] = amd_bytealign (w[ 0], w[ 1], offset); + w[53] = amd_bytealign ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = amd_bytealign (w[ 8], w[ 9], offset); + w[62] = amd_bytealign (w[ 7], w[ 8], offset); + w[61] = amd_bytealign (w[ 6], w[ 7], offset); + w[60] = amd_bytealign (w[ 5], w[ 6], offset); + w[59] = amd_bytealign (w[ 4], w[ 5], offset); + w[58] = amd_bytealign (w[ 3], w[ 4], offset); + w[57] = amd_bytealign (w[ 2], w[ 3], offset); + w[56] = amd_bytealign (w[ 1], w[ 2], offset); + w[55] = amd_bytealign (w[ 0], w[ 1], offset); + w[54] = amd_bytealign ( 0, w[ 0], offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + 
w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = amd_bytealign (w[ 7], w[ 8], offset); + w[62] = amd_bytealign (w[ 6], w[ 7], offset); + w[61] = amd_bytealign (w[ 5], w[ 6], offset); + w[60] = amd_bytealign (w[ 4], w[ 5], offset); + w[59] = amd_bytealign (w[ 3], w[ 4], offset); + w[58] = amd_bytealign (w[ 2], w[ 3], offset); + w[57] = amd_bytealign (w[ 1], w[ 2], offset); + w[56] = amd_bytealign (w[ 0], w[ 1], offset); + w[55] = amd_bytealign ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = amd_bytealign (w[ 6], w[ 7], offset); + w[62] = amd_bytealign (w[ 5], w[ 6], offset); + w[61] = amd_bytealign (w[ 4], w[ 5], offset); + w[60] = amd_bytealign (w[ 3], w[ 4], offset); + w[59] = amd_bytealign (w[ 2], w[ 3], offset); + w[58] = amd_bytealign (w[ 1], w[ 2], offset); + w[57] = amd_bytealign (w[ 
0], w[ 1], offset); + w[56] = amd_bytealign ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = amd_bytealign (w[ 5], w[ 6], offset); + w[62] = amd_bytealign (w[ 4], w[ 5], offset); + w[61] = amd_bytealign (w[ 3], w[ 4], offset); + w[60] = amd_bytealign (w[ 2], w[ 3], offset); + w[59] = amd_bytealign (w[ 1], w[ 2], offset); + w[58] = amd_bytealign (w[ 0], w[ 1], offset); + w[57] = amd_bytealign ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = amd_bytealign (w[ 4], w[ 5], offset); + w[62] = amd_bytealign (w[ 3], w[ 
4], offset); + w[61] = amd_bytealign (w[ 2], w[ 3], offset); + w[60] = amd_bytealign (w[ 1], w[ 2], offset); + w[59] = amd_bytealign (w[ 0], w[ 1], offset); + w[58] = amd_bytealign ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = amd_bytealign (w[ 3], w[ 4], offset); + w[62] = amd_bytealign (w[ 2], w[ 3], offset); + w[61] = amd_bytealign (w[ 1], w[ 2], offset); + w[60] = amd_bytealign (w[ 0], w[ 1], offset); + w[59] = amd_bytealign ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + 
case 60: + w[63] = amd_bytealign (w[ 2], w[ 3], offset); + w[62] = amd_bytealign (w[ 1], w[ 2], offset); + w[61] = amd_bytealign (w[ 0], w[ 1], offset); + w[60] = amd_bytealign ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = amd_bytealign (w[ 1], w[ 2], offset); + w[62] = amd_bytealign (w[ 0], w[ 1], offset); + w[61] = amd_bytealign ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = amd_bytealign (w[ 0], w[ 1], 
offset); + w[62] = amd_bytealign ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = amd_bytealign ( 0, w[ 0], offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w[63] = __byte_perm (w[63], w[62], selector); + w[62] = __byte_perm (w[62], w[61], selector); + w[61] 
= __byte_perm (w[61], w[60], selector); + w[60] = __byte_perm (w[60], w[59], selector); + w[59] = __byte_perm (w[59], w[58], selector); + w[58] = __byte_perm (w[58], w[57], selector); + w[57] = __byte_perm (w[57], w[56], selector); + w[56] = __byte_perm (w[56], w[55], selector); + w[55] = __byte_perm (w[55], w[54], selector); + w[54] = __byte_perm (w[54], w[53], selector); + w[53] = __byte_perm (w[53], w[52], selector); + w[52] = __byte_perm (w[52], w[51], selector); + w[51] = __byte_perm (w[51], w[50], selector); + w[50] = __byte_perm (w[50], w[49], selector); + w[49] = __byte_perm (w[49], w[48], selector); + w[48] = __byte_perm (w[48], w[47], selector); + w[47] = __byte_perm (w[47], w[46], selector); + w[46] = __byte_perm (w[46], w[45], selector); + w[45] = __byte_perm (w[45], w[44], selector); + w[44] = __byte_perm (w[44], w[43], selector); + w[43] = __byte_perm (w[43], w[42], selector); + w[42] = __byte_perm (w[42], w[41], selector); + w[41] = __byte_perm (w[41], w[40], selector); + w[40] = __byte_perm (w[40], w[39], selector); + w[39] = __byte_perm (w[39], w[38], selector); + w[38] = __byte_perm (w[38], w[37], selector); + w[37] = __byte_perm (w[37], w[36], selector); + w[36] = __byte_perm (w[36], w[35], selector); + w[35] = __byte_perm (w[35], w[34], selector); + w[34] = __byte_perm (w[34], w[33], selector); + w[33] = __byte_perm (w[33], w[32], selector); + w[32] = __byte_perm (w[32], w[31], selector); + w[31] = __byte_perm (w[31], w[30], selector); + w[30] = __byte_perm (w[30], w[29], selector); + w[29] = __byte_perm (w[29], w[28], selector); + w[28] = __byte_perm (w[28], w[27], selector); + w[27] = __byte_perm (w[27], w[26], selector); + w[26] = __byte_perm (w[26], w[25], selector); + w[25] = __byte_perm (w[25], w[24], selector); + w[24] = __byte_perm (w[24], w[23], selector); + w[23] = __byte_perm (w[23], w[22], selector); + w[22] = __byte_perm (w[22], w[21], selector); + w[21] = __byte_perm (w[21], w[20], selector); + w[20] = __byte_perm (w[20], w[19], 
selector); + w[19] = __byte_perm (w[19], w[18], selector); + w[18] = __byte_perm (w[18], w[17], selector); + w[17] = __byte_perm (w[17], w[16], selector); + w[16] = __byte_perm (w[16], w[15], selector); + w[15] = __byte_perm (w[15], w[14], selector); + w[14] = __byte_perm (w[14], w[13], selector); + w[13] = __byte_perm (w[13], w[12], selector); + w[12] = __byte_perm (w[12], w[11], selector); + w[11] = __byte_perm (w[11], w[10], selector); + w[10] = __byte_perm (w[10], w[ 9], selector); + w[ 9] = __byte_perm (w[ 9], w[ 8], selector); + w[ 8] = __byte_perm (w[ 8], w[ 7], selector); + w[ 7] = __byte_perm (w[ 7], w[ 6], selector); + w[ 6] = __byte_perm (w[ 6], w[ 5], selector); + w[ 5] = __byte_perm (w[ 5], w[ 4], selector); + w[ 4] = __byte_perm (w[ 4], w[ 3], selector); + w[ 3] = __byte_perm (w[ 3], w[ 2], selector); + w[ 2] = __byte_perm (w[ 2], w[ 1], selector); + w[ 1] = __byte_perm (w[ 1], w[ 0], selector); + w[ 0] = __byte_perm (w[ 0], 0, selector); + + break; + + case 1: + w[63] = __byte_perm (w[62], w[61], selector); + w[62] = __byte_perm (w[61], w[60], selector); + w[61] = __byte_perm (w[60], w[59], selector); + w[60] = __byte_perm (w[59], w[58], selector); + w[59] = __byte_perm (w[58], w[57], selector); + w[58] = __byte_perm (w[57], w[56], selector); + w[57] = __byte_perm (w[56], w[55], selector); + w[56] = __byte_perm (w[55], w[54], selector); + w[55] = __byte_perm (w[54], w[53], selector); + w[54] = __byte_perm (w[53], w[52], selector); + w[53] = __byte_perm (w[52], w[51], selector); + w[52] = __byte_perm (w[51], w[50], selector); + w[51] = __byte_perm (w[50], w[49], selector); + w[50] = __byte_perm (w[49], w[48], selector); + w[49] = __byte_perm (w[48], w[47], selector); + w[48] = __byte_perm (w[47], w[46], selector); + w[47] = __byte_perm (w[46], w[45], selector); + w[46] = __byte_perm (w[45], w[44], selector); + w[45] = __byte_perm (w[44], w[43], selector); + w[44] = __byte_perm (w[43], w[42], selector); + w[43] = __byte_perm (w[42], w[41], selector); + 
w[42] = __byte_perm (w[41], w[40], selector); + w[41] = __byte_perm (w[40], w[39], selector); + w[40] = __byte_perm (w[39], w[38], selector); + w[39] = __byte_perm (w[38], w[37], selector); + w[38] = __byte_perm (w[37], w[36], selector); + w[37] = __byte_perm (w[36], w[35], selector); + w[36] = __byte_perm (w[35], w[34], selector); + w[35] = __byte_perm (w[34], w[33], selector); + w[34] = __byte_perm (w[33], w[32], selector); + w[33] = __byte_perm (w[32], w[31], selector); + w[32] = __byte_perm (w[31], w[30], selector); + w[31] = __byte_perm (w[30], w[29], selector); + w[30] = __byte_perm (w[29], w[28], selector); + w[29] = __byte_perm (w[28], w[27], selector); + w[28] = __byte_perm (w[27], w[26], selector); + w[27] = __byte_perm (w[26], w[25], selector); + w[26] = __byte_perm (w[25], w[24], selector); + w[25] = __byte_perm (w[24], w[23], selector); + w[24] = __byte_perm (w[23], w[22], selector); + w[23] = __byte_perm (w[22], w[21], selector); + w[22] = __byte_perm (w[21], w[20], selector); + w[21] = __byte_perm (w[20], w[19], selector); + w[20] = __byte_perm (w[19], w[18], selector); + w[19] = __byte_perm (w[18], w[17], selector); + w[18] = __byte_perm (w[17], w[16], selector); + w[17] = __byte_perm (w[16], w[15], selector); + w[16] = __byte_perm (w[15], w[14], selector); + w[15] = __byte_perm (w[14], w[13], selector); + w[14] = __byte_perm (w[13], w[12], selector); + w[13] = __byte_perm (w[12], w[11], selector); + w[12] = __byte_perm (w[11], w[10], selector); + w[11] = __byte_perm (w[10], w[ 9], selector); + w[10] = __byte_perm (w[ 9], w[ 8], selector); + w[ 9] = __byte_perm (w[ 8], w[ 7], selector); + w[ 8] = __byte_perm (w[ 7], w[ 6], selector); + w[ 7] = __byte_perm (w[ 6], w[ 5], selector); + w[ 6] = __byte_perm (w[ 5], w[ 4], selector); + w[ 5] = __byte_perm (w[ 4], w[ 3], selector); + w[ 4] = __byte_perm (w[ 3], w[ 2], selector); + w[ 3] = __byte_perm (w[ 2], w[ 1], selector); + w[ 2] = __byte_perm (w[ 1], w[ 0], selector); + w[ 1] = __byte_perm (w[ 0], 0, 
selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = __byte_perm (w[61], w[60], selector); + w[62] = __byte_perm (w[60], w[59], selector); + w[61] = __byte_perm (w[59], w[58], selector); + w[60] = __byte_perm (w[58], w[57], selector); + w[59] = __byte_perm (w[57], w[56], selector); + w[58] = __byte_perm (w[56], w[55], selector); + w[57] = __byte_perm (w[55], w[54], selector); + w[56] = __byte_perm (w[54], w[53], selector); + w[55] = __byte_perm (w[53], w[52], selector); + w[54] = __byte_perm (w[52], w[51], selector); + w[53] = __byte_perm (w[51], w[50], selector); + w[52] = __byte_perm (w[50], w[49], selector); + w[51] = __byte_perm (w[49], w[48], selector); + w[50] = __byte_perm (w[48], w[47], selector); + w[49] = __byte_perm (w[47], w[46], selector); + w[48] = __byte_perm (w[46], w[45], selector); + w[47] = __byte_perm (w[45], w[44], selector); + w[46] = __byte_perm (w[44], w[43], selector); + w[45] = __byte_perm (w[43], w[42], selector); + w[44] = __byte_perm (w[42], w[41], selector); + w[43] = __byte_perm (w[41], w[40], selector); + w[42] = __byte_perm (w[40], w[39], selector); + w[41] = __byte_perm (w[39], w[38], selector); + w[40] = __byte_perm (w[38], w[37], selector); + w[39] = __byte_perm (w[37], w[36], selector); + w[38] = __byte_perm (w[36], w[35], selector); + w[37] = __byte_perm (w[35], w[34], selector); + w[36] = __byte_perm (w[34], w[33], selector); + w[35] = __byte_perm (w[33], w[32], selector); + w[34] = __byte_perm (w[32], w[31], selector); + w[33] = __byte_perm (w[31], w[30], selector); + w[32] = __byte_perm (w[30], w[29], selector); + w[31] = __byte_perm (w[29], w[28], selector); + w[30] = __byte_perm (w[28], w[27], selector); + w[29] = __byte_perm (w[27], w[26], selector); + w[28] = __byte_perm (w[26], w[25], selector); + w[27] = __byte_perm (w[25], w[24], selector); + w[26] = __byte_perm (w[24], w[23], selector); + w[25] = __byte_perm (w[23], w[22], selector); + w[24] = __byte_perm (w[22], w[21], selector); + w[23] = __byte_perm (w[21], 
w[20], selector); + w[22] = __byte_perm (w[20], w[19], selector); + w[21] = __byte_perm (w[19], w[18], selector); + w[20] = __byte_perm (w[18], w[17], selector); + w[19] = __byte_perm (w[17], w[16], selector); + w[18] = __byte_perm (w[16], w[15], selector); + w[17] = __byte_perm (w[15], w[14], selector); + w[16] = __byte_perm (w[14], w[13], selector); + w[15] = __byte_perm (w[13], w[12], selector); + w[14] = __byte_perm (w[12], w[11], selector); + w[13] = __byte_perm (w[11], w[10], selector); + w[12] = __byte_perm (w[10], w[ 9], selector); + w[11] = __byte_perm (w[ 9], w[ 8], selector); + w[10] = __byte_perm (w[ 8], w[ 7], selector); + w[ 9] = __byte_perm (w[ 7], w[ 6], selector); + w[ 8] = __byte_perm (w[ 6], w[ 5], selector); + w[ 7] = __byte_perm (w[ 5], w[ 4], selector); + w[ 6] = __byte_perm (w[ 4], w[ 3], selector); + w[ 5] = __byte_perm (w[ 3], w[ 2], selector); + w[ 4] = __byte_perm (w[ 2], w[ 1], selector); + w[ 3] = __byte_perm (w[ 1], w[ 0], selector); + w[ 2] = __byte_perm (w[ 0], 0, selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = __byte_perm (w[60], w[59], selector); + w[62] = __byte_perm (w[59], w[58], selector); + w[61] = __byte_perm (w[58], w[57], selector); + w[60] = __byte_perm (w[57], w[56], selector); + w[59] = __byte_perm (w[56], w[55], selector); + w[58] = __byte_perm (w[55], w[54], selector); + w[57] = __byte_perm (w[54], w[53], selector); + w[56] = __byte_perm (w[53], w[52], selector); + w[55] = __byte_perm (w[52], w[51], selector); + w[54] = __byte_perm (w[51], w[50], selector); + w[53] = __byte_perm (w[50], w[49], selector); + w[52] = __byte_perm (w[49], w[48], selector); + w[51] = __byte_perm (w[48], w[47], selector); + w[50] = __byte_perm (w[47], w[46], selector); + w[49] = __byte_perm (w[46], w[45], selector); + w[48] = __byte_perm (w[45], w[44], selector); + w[47] = __byte_perm (w[44], w[43], selector); + w[46] = __byte_perm (w[43], w[42], selector); + w[45] = __byte_perm (w[42], w[41], selector); + w[44] = 
__byte_perm (w[41], w[40], selector); + w[43] = __byte_perm (w[40], w[39], selector); + w[42] = __byte_perm (w[39], w[38], selector); + w[41] = __byte_perm (w[38], w[37], selector); + w[40] = __byte_perm (w[37], w[36], selector); + w[39] = __byte_perm (w[36], w[35], selector); + w[38] = __byte_perm (w[35], w[34], selector); + w[37] = __byte_perm (w[34], w[33], selector); + w[36] = __byte_perm (w[33], w[32], selector); + w[35] = __byte_perm (w[32], w[31], selector); + w[34] = __byte_perm (w[31], w[30], selector); + w[33] = __byte_perm (w[30], w[29], selector); + w[32] = __byte_perm (w[29], w[28], selector); + w[31] = __byte_perm (w[28], w[27], selector); + w[30] = __byte_perm (w[27], w[26], selector); + w[29] = __byte_perm (w[26], w[25], selector); + w[28] = __byte_perm (w[25], w[24], selector); + w[27] = __byte_perm (w[24], w[23], selector); + w[26] = __byte_perm (w[23], w[22], selector); + w[25] = __byte_perm (w[22], w[21], selector); + w[24] = __byte_perm (w[21], w[20], selector); + w[23] = __byte_perm (w[20], w[19], selector); + w[22] = __byte_perm (w[19], w[18], selector); + w[21] = __byte_perm (w[18], w[17], selector); + w[20] = __byte_perm (w[17], w[16], selector); + w[19] = __byte_perm (w[16], w[15], selector); + w[18] = __byte_perm (w[15], w[14], selector); + w[17] = __byte_perm (w[14], w[13], selector); + w[16] = __byte_perm (w[13], w[12], selector); + w[15] = __byte_perm (w[12], w[11], selector); + w[14] = __byte_perm (w[11], w[10], selector); + w[13] = __byte_perm (w[10], w[ 9], selector); + w[12] = __byte_perm (w[ 9], w[ 8], selector); + w[11] = __byte_perm (w[ 8], w[ 7], selector); + w[10] = __byte_perm (w[ 7], w[ 6], selector); + w[ 9] = __byte_perm (w[ 6], w[ 5], selector); + w[ 8] = __byte_perm (w[ 5], w[ 4], selector); + w[ 7] = __byte_perm (w[ 4], w[ 3], selector); + w[ 6] = __byte_perm (w[ 3], w[ 2], selector); + w[ 5] = __byte_perm (w[ 2], w[ 1], selector); + w[ 4] = __byte_perm (w[ 1], w[ 0], selector); + w[ 3] = __byte_perm (w[ 0], 0, 
selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = __byte_perm (w[59], w[58], selector); + w[62] = __byte_perm (w[58], w[57], selector); + w[61] = __byte_perm (w[57], w[56], selector); + w[60] = __byte_perm (w[56], w[55], selector); + w[59] = __byte_perm (w[55], w[54], selector); + w[58] = __byte_perm (w[54], w[53], selector); + w[57] = __byte_perm (w[53], w[52], selector); + w[56] = __byte_perm (w[52], w[51], selector); + w[55] = __byte_perm (w[51], w[50], selector); + w[54] = __byte_perm (w[50], w[49], selector); + w[53] = __byte_perm (w[49], w[48], selector); + w[52] = __byte_perm (w[48], w[47], selector); + w[51] = __byte_perm (w[47], w[46], selector); + w[50] = __byte_perm (w[46], w[45], selector); + w[49] = __byte_perm (w[45], w[44], selector); + w[48] = __byte_perm (w[44], w[43], selector); + w[47] = __byte_perm (w[43], w[42], selector); + w[46] = __byte_perm (w[42], w[41], selector); + w[45] = __byte_perm (w[41], w[40], selector); + w[44] = __byte_perm (w[40], w[39], selector); + w[43] = __byte_perm (w[39], w[38], selector); + w[42] = __byte_perm (w[38], w[37], selector); + w[41] = __byte_perm (w[37], w[36], selector); + w[40] = __byte_perm (w[36], w[35], selector); + w[39] = __byte_perm (w[35], w[34], selector); + w[38] = __byte_perm (w[34], w[33], selector); + w[37] = __byte_perm (w[33], w[32], selector); + w[36] = __byte_perm (w[32], w[31], selector); + w[35] = __byte_perm (w[31], w[30], selector); + w[34] = __byte_perm (w[30], w[29], selector); + w[33] = __byte_perm (w[29], w[28], selector); + w[32] = __byte_perm (w[28], w[27], selector); + w[31] = __byte_perm (w[27], w[26], selector); + w[30] = __byte_perm (w[26], w[25], selector); + w[29] = __byte_perm (w[25], w[24], selector); + w[28] = __byte_perm (w[24], w[23], selector); + w[27] = __byte_perm (w[23], w[22], selector); + w[26] = __byte_perm (w[22], w[21], selector); + w[25] = __byte_perm (w[21], w[20], selector); + w[24] = __byte_perm (w[20], w[19], selector); + 
w[23] = __byte_perm (w[19], w[18], selector); + w[22] = __byte_perm (w[18], w[17], selector); + w[21] = __byte_perm (w[17], w[16], selector); + w[20] = __byte_perm (w[16], w[15], selector); + w[19] = __byte_perm (w[15], w[14], selector); + w[18] = __byte_perm (w[14], w[13], selector); + w[17] = __byte_perm (w[13], w[12], selector); + w[16] = __byte_perm (w[12], w[11], selector); + w[15] = __byte_perm (w[11], w[10], selector); + w[14] = __byte_perm (w[10], w[ 9], selector); + w[13] = __byte_perm (w[ 9], w[ 8], selector); + w[12] = __byte_perm (w[ 8], w[ 7], selector); + w[11] = __byte_perm (w[ 7], w[ 6], selector); + w[10] = __byte_perm (w[ 6], w[ 5], selector); + w[ 9] = __byte_perm (w[ 5], w[ 4], selector); + w[ 8] = __byte_perm (w[ 4], w[ 3], selector); + w[ 7] = __byte_perm (w[ 3], w[ 2], selector); + w[ 6] = __byte_perm (w[ 2], w[ 1], selector); + w[ 5] = __byte_perm (w[ 1], w[ 0], selector); + w[ 4] = __byte_perm (w[ 0], 0, selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = __byte_perm (w[58], w[57], selector); + w[62] = __byte_perm (w[57], w[56], selector); + w[61] = __byte_perm (w[56], w[55], selector); + w[60] = __byte_perm (w[55], w[54], selector); + w[59] = __byte_perm (w[54], w[53], selector); + w[58] = __byte_perm (w[53], w[52], selector); + w[57] = __byte_perm (w[52], w[51], selector); + w[56] = __byte_perm (w[51], w[50], selector); + w[55] = __byte_perm (w[50], w[49], selector); + w[54] = __byte_perm (w[49], w[48], selector); + w[53] = __byte_perm (w[48], w[47], selector); + w[52] = __byte_perm (w[47], w[46], selector); + w[51] = __byte_perm (w[46], w[45], selector); + w[50] = __byte_perm (w[45], w[44], selector); + w[49] = __byte_perm (w[44], w[43], selector); + w[48] = __byte_perm (w[43], w[42], selector); + w[47] = __byte_perm (w[42], w[41], selector); + w[46] = __byte_perm (w[41], w[40], selector); + w[45] = __byte_perm (w[40], w[39], selector); + w[44] = __byte_perm (w[39], w[38], selector); + w[43] = 
__byte_perm (w[38], w[37], selector); + w[42] = __byte_perm (w[37], w[36], selector); + w[41] = __byte_perm (w[36], w[35], selector); + w[40] = __byte_perm (w[35], w[34], selector); + w[39] = __byte_perm (w[34], w[33], selector); + w[38] = __byte_perm (w[33], w[32], selector); + w[37] = __byte_perm (w[32], w[31], selector); + w[36] = __byte_perm (w[31], w[30], selector); + w[35] = __byte_perm (w[30], w[29], selector); + w[34] = __byte_perm (w[29], w[28], selector); + w[33] = __byte_perm (w[28], w[27], selector); + w[32] = __byte_perm (w[27], w[26], selector); + w[31] = __byte_perm (w[26], w[25], selector); + w[30] = __byte_perm (w[25], w[24], selector); + w[29] = __byte_perm (w[24], w[23], selector); + w[28] = __byte_perm (w[23], w[22], selector); + w[27] = __byte_perm (w[22], w[21], selector); + w[26] = __byte_perm (w[21], w[20], selector); + w[25] = __byte_perm (w[20], w[19], selector); + w[24] = __byte_perm (w[19], w[18], selector); + w[23] = __byte_perm (w[18], w[17], selector); + w[22] = __byte_perm (w[17], w[16], selector); + w[21] = __byte_perm (w[16], w[15], selector); + w[20] = __byte_perm (w[15], w[14], selector); + w[19] = __byte_perm (w[14], w[13], selector); + w[18] = __byte_perm (w[13], w[12], selector); + w[17] = __byte_perm (w[12], w[11], selector); + w[16] = __byte_perm (w[11], w[10], selector); + w[15] = __byte_perm (w[10], w[ 9], selector); + w[14] = __byte_perm (w[ 9], w[ 8], selector); + w[13] = __byte_perm (w[ 8], w[ 7], selector); + w[12] = __byte_perm (w[ 7], w[ 6], selector); + w[11] = __byte_perm (w[ 6], w[ 5], selector); + w[10] = __byte_perm (w[ 5], w[ 4], selector); + w[ 9] = __byte_perm (w[ 4], w[ 3], selector); + w[ 8] = __byte_perm (w[ 3], w[ 2], selector); + w[ 7] = __byte_perm (w[ 2], w[ 1], selector); + w[ 6] = __byte_perm (w[ 1], w[ 0], selector); + w[ 5] = __byte_perm (w[ 0], 0, selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = __byte_perm (w[57], w[56], selector); + 
w[62] = __byte_perm (w[56], w[55], selector); + w[61] = __byte_perm (w[55], w[54], selector); + w[60] = __byte_perm (w[54], w[53], selector); + w[59] = __byte_perm (w[53], w[52], selector); + w[58] = __byte_perm (w[52], w[51], selector); + w[57] = __byte_perm (w[51], w[50], selector); + w[56] = __byte_perm (w[50], w[49], selector); + w[55] = __byte_perm (w[49], w[48], selector); + w[54] = __byte_perm (w[48], w[47], selector); + w[53] = __byte_perm (w[47], w[46], selector); + w[52] = __byte_perm (w[46], w[45], selector); + w[51] = __byte_perm (w[45], w[44], selector); + w[50] = __byte_perm (w[44], w[43], selector); + w[49] = __byte_perm (w[43], w[42], selector); + w[48] = __byte_perm (w[42], w[41], selector); + w[47] = __byte_perm (w[41], w[40], selector); + w[46] = __byte_perm (w[40], w[39], selector); + w[45] = __byte_perm (w[39], w[38], selector); + w[44] = __byte_perm (w[38], w[37], selector); + w[43] = __byte_perm (w[37], w[36], selector); + w[42] = __byte_perm (w[36], w[35], selector); + w[41] = __byte_perm (w[35], w[34], selector); + w[40] = __byte_perm (w[34], w[33], selector); + w[39] = __byte_perm (w[33], w[32], selector); + w[38] = __byte_perm (w[32], w[31], selector); + w[37] = __byte_perm (w[31], w[30], selector); + w[36] = __byte_perm (w[30], w[29], selector); + w[35] = __byte_perm (w[29], w[28], selector); + w[34] = __byte_perm (w[28], w[27], selector); + w[33] = __byte_perm (w[27], w[26], selector); + w[32] = __byte_perm (w[26], w[25], selector); + w[31] = __byte_perm (w[25], w[24], selector); + w[30] = __byte_perm (w[24], w[23], selector); + w[29] = __byte_perm (w[23], w[22], selector); + w[28] = __byte_perm (w[22], w[21], selector); + w[27] = __byte_perm (w[21], w[20], selector); + w[26] = __byte_perm (w[20], w[19], selector); + w[25] = __byte_perm (w[19], w[18], selector); + w[24] = __byte_perm (w[18], w[17], selector); + w[23] = __byte_perm (w[17], w[16], selector); + w[22] = __byte_perm (w[16], w[15], selector); + w[21] = __byte_perm (w[15], 
w[14], selector); + w[20] = __byte_perm (w[14], w[13], selector); + w[19] = __byte_perm (w[13], w[12], selector); + w[18] = __byte_perm (w[12], w[11], selector); + w[17] = __byte_perm (w[11], w[10], selector); + w[16] = __byte_perm (w[10], w[ 9], selector); + w[15] = __byte_perm (w[ 9], w[ 8], selector); + w[14] = __byte_perm (w[ 8], w[ 7], selector); + w[13] = __byte_perm (w[ 7], w[ 6], selector); + w[12] = __byte_perm (w[ 6], w[ 5], selector); + w[11] = __byte_perm (w[ 5], w[ 4], selector); + w[10] = __byte_perm (w[ 4], w[ 3], selector); + w[ 9] = __byte_perm (w[ 3], w[ 2], selector); + w[ 8] = __byte_perm (w[ 2], w[ 1], selector); + w[ 7] = __byte_perm (w[ 1], w[ 0], selector); + w[ 6] = __byte_perm (w[ 0], 0, selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = __byte_perm (w[56], w[55], selector); + w[62] = __byte_perm (w[55], w[54], selector); + w[61] = __byte_perm (w[54], w[53], selector); + w[60] = __byte_perm (w[53], w[52], selector); + w[59] = __byte_perm (w[52], w[51], selector); + w[58] = __byte_perm (w[51], w[50], selector); + w[57] = __byte_perm (w[50], w[49], selector); + w[56] = __byte_perm (w[49], w[48], selector); + w[55] = __byte_perm (w[48], w[47], selector); + w[54] = __byte_perm (w[47], w[46], selector); + w[53] = __byte_perm (w[46], w[45], selector); + w[52] = __byte_perm (w[45], w[44], selector); + w[51] = __byte_perm (w[44], w[43], selector); + w[50] = __byte_perm (w[43], w[42], selector); + w[49] = __byte_perm (w[42], w[41], selector); + w[48] = __byte_perm (w[41], w[40], selector); + w[47] = __byte_perm (w[40], w[39], selector); + w[46] = __byte_perm (w[39], w[38], selector); + w[45] = __byte_perm (w[38], w[37], selector); + w[44] = __byte_perm (w[37], w[36], selector); + w[43] = __byte_perm (w[36], w[35], selector); + w[42] = __byte_perm (w[35], w[34], selector); + w[41] = __byte_perm (w[34], w[33], selector); + w[40] = __byte_perm (w[33], w[32], selector); + w[39] = 
__byte_perm (w[32], w[31], selector); + w[38] = __byte_perm (w[31], w[30], selector); + w[37] = __byte_perm (w[30], w[29], selector); + w[36] = __byte_perm (w[29], w[28], selector); + w[35] = __byte_perm (w[28], w[27], selector); + w[34] = __byte_perm (w[27], w[26], selector); + w[33] = __byte_perm (w[26], w[25], selector); + w[32] = __byte_perm (w[25], w[24], selector); + w[31] = __byte_perm (w[24], w[23], selector); + w[30] = __byte_perm (w[23], w[22], selector); + w[29] = __byte_perm (w[22], w[21], selector); + w[28] = __byte_perm (w[21], w[20], selector); + w[27] = __byte_perm (w[20], w[19], selector); + w[26] = __byte_perm (w[19], w[18], selector); + w[25] = __byte_perm (w[18], w[17], selector); + w[24] = __byte_perm (w[17], w[16], selector); + w[23] = __byte_perm (w[16], w[15], selector); + w[22] = __byte_perm (w[15], w[14], selector); + w[21] = __byte_perm (w[14], w[13], selector); + w[20] = __byte_perm (w[13], w[12], selector); + w[19] = __byte_perm (w[12], w[11], selector); + w[18] = __byte_perm (w[11], w[10], selector); + w[17] = __byte_perm (w[10], w[ 9], selector); + w[16] = __byte_perm (w[ 9], w[ 8], selector); + w[15] = __byte_perm (w[ 8], w[ 7], selector); + w[14] = __byte_perm (w[ 7], w[ 6], selector); + w[13] = __byte_perm (w[ 6], w[ 5], selector); + w[12] = __byte_perm (w[ 5], w[ 4], selector); + w[11] = __byte_perm (w[ 4], w[ 3], selector); + w[10] = __byte_perm (w[ 3], w[ 2], selector); + w[ 9] = __byte_perm (w[ 2], w[ 1], selector); + w[ 8] = __byte_perm (w[ 1], w[ 0], selector); + w[ 7] = __byte_perm (w[ 0], 0, selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = __byte_perm (w[55], w[54], selector); + w[62] = __byte_perm (w[54], w[53], selector); + w[61] = __byte_perm (w[53], w[52], selector); + w[60] = __byte_perm (w[52], w[51], selector); + w[59] = __byte_perm (w[51], w[50], selector); + w[58] = __byte_perm (w[50], w[49], selector); + w[57] = __byte_perm 
(w[49], w[48], selector); + w[56] = __byte_perm (w[48], w[47], selector); + w[55] = __byte_perm (w[47], w[46], selector); + w[54] = __byte_perm (w[46], w[45], selector); + w[53] = __byte_perm (w[45], w[44], selector); + w[52] = __byte_perm (w[44], w[43], selector); + w[51] = __byte_perm (w[43], w[42], selector); + w[50] = __byte_perm (w[42], w[41], selector); + w[49] = __byte_perm (w[41], w[40], selector); + w[48] = __byte_perm (w[40], w[39], selector); + w[47] = __byte_perm (w[39], w[38], selector); + w[46] = __byte_perm (w[38], w[37], selector); + w[45] = __byte_perm (w[37], w[36], selector); + w[44] = __byte_perm (w[36], w[35], selector); + w[43] = __byte_perm (w[35], w[34], selector); + w[42] = __byte_perm (w[34], w[33], selector); + w[41] = __byte_perm (w[33], w[32], selector); + w[40] = __byte_perm (w[32], w[31], selector); + w[39] = __byte_perm (w[31], w[30], selector); + w[38] = __byte_perm (w[30], w[29], selector); + w[37] = __byte_perm (w[29], w[28], selector); + w[36] = __byte_perm (w[28], w[27], selector); + w[35] = __byte_perm (w[27], w[26], selector); + w[34] = __byte_perm (w[26], w[25], selector); + w[33] = __byte_perm (w[25], w[24], selector); + w[32] = __byte_perm (w[24], w[23], selector); + w[31] = __byte_perm (w[23], w[22], selector); + w[30] = __byte_perm (w[22], w[21], selector); + w[29] = __byte_perm (w[21], w[20], selector); + w[28] = __byte_perm (w[20], w[19], selector); + w[27] = __byte_perm (w[19], w[18], selector); + w[26] = __byte_perm (w[18], w[17], selector); + w[25] = __byte_perm (w[17], w[16], selector); + w[24] = __byte_perm (w[16], w[15], selector); + w[23] = __byte_perm (w[15], w[14], selector); + w[22] = __byte_perm (w[14], w[13], selector); + w[21] = __byte_perm (w[13], w[12], selector); + w[20] = __byte_perm (w[12], w[11], selector); + w[19] = __byte_perm (w[11], w[10], selector); + w[18] = __byte_perm (w[10], w[ 9], selector); + w[17] = __byte_perm (w[ 9], w[ 8], selector); + w[16] = __byte_perm (w[ 8], w[ 7], selector); + 
w[15] = __byte_perm (w[ 7], w[ 6], selector); + w[14] = __byte_perm (w[ 6], w[ 5], selector); + w[13] = __byte_perm (w[ 5], w[ 4], selector); + w[12] = __byte_perm (w[ 4], w[ 3], selector); + w[11] = __byte_perm (w[ 3], w[ 2], selector); + w[10] = __byte_perm (w[ 2], w[ 1], selector); + w[ 9] = __byte_perm (w[ 1], w[ 0], selector); + w[ 8] = __byte_perm (w[ 0], 0, selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = __byte_perm (w[54], w[53], selector); + w[62] = __byte_perm (w[53], w[52], selector); + w[61] = __byte_perm (w[52], w[51], selector); + w[60] = __byte_perm (w[51], w[50], selector); + w[59] = __byte_perm (w[50], w[49], selector); + w[58] = __byte_perm (w[49], w[48], selector); + w[57] = __byte_perm (w[48], w[47], selector); + w[56] = __byte_perm (w[47], w[46], selector); + w[55] = __byte_perm (w[46], w[45], selector); + w[54] = __byte_perm (w[45], w[44], selector); + w[53] = __byte_perm (w[44], w[43], selector); + w[52] = __byte_perm (w[43], w[42], selector); + w[51] = __byte_perm (w[42], w[41], selector); + w[50] = __byte_perm (w[41], w[40], selector); + w[49] = __byte_perm (w[40], w[39], selector); + w[48] = __byte_perm (w[39], w[38], selector); + w[47] = __byte_perm (w[38], w[37], selector); + w[46] = __byte_perm (w[37], w[36], selector); + w[45] = __byte_perm (w[36], w[35], selector); + w[44] = __byte_perm (w[35], w[34], selector); + w[43] = __byte_perm (w[34], w[33], selector); + w[42] = __byte_perm (w[33], w[32], selector); + w[41] = __byte_perm (w[32], w[31], selector); + w[40] = __byte_perm (w[31], w[30], selector); + w[39] = __byte_perm (w[30], w[29], selector); + w[38] = __byte_perm (w[29], w[28], selector); + w[37] = __byte_perm (w[28], w[27], selector); + w[36] = __byte_perm (w[27], w[26], selector); + w[35] = __byte_perm (w[26], w[25], selector); + w[34] = __byte_perm (w[25], w[24], selector); + w[33] = __byte_perm (w[24], w[23], selector); + 
w[32] = __byte_perm (w[23], w[22], selector); + w[31] = __byte_perm (w[22], w[21], selector); + w[30] = __byte_perm (w[21], w[20], selector); + w[29] = __byte_perm (w[20], w[19], selector); + w[28] = __byte_perm (w[19], w[18], selector); + w[27] = __byte_perm (w[18], w[17], selector); + w[26] = __byte_perm (w[17], w[16], selector); + w[25] = __byte_perm (w[16], w[15], selector); + w[24] = __byte_perm (w[15], w[14], selector); + w[23] = __byte_perm (w[14], w[13], selector); + w[22] = __byte_perm (w[13], w[12], selector); + w[21] = __byte_perm (w[12], w[11], selector); + w[20] = __byte_perm (w[11], w[10], selector); + w[19] = __byte_perm (w[10], w[ 9], selector); + w[18] = __byte_perm (w[ 9], w[ 8], selector); + w[17] = __byte_perm (w[ 8], w[ 7], selector); + w[16] = __byte_perm (w[ 7], w[ 6], selector); + w[15] = __byte_perm (w[ 6], w[ 5], selector); + w[14] = __byte_perm (w[ 5], w[ 4], selector); + w[13] = __byte_perm (w[ 4], w[ 3], selector); + w[12] = __byte_perm (w[ 3], w[ 2], selector); + w[11] = __byte_perm (w[ 2], w[ 1], selector); + w[10] = __byte_perm (w[ 1], w[ 0], selector); + w[ 9] = __byte_perm (w[ 0], 0, selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = __byte_perm (w[53], w[52], selector); + w[62] = __byte_perm (w[52], w[51], selector); + w[61] = __byte_perm (w[51], w[50], selector); + w[60] = __byte_perm (w[50], w[49], selector); + w[59] = __byte_perm (w[49], w[48], selector); + w[58] = __byte_perm (w[48], w[47], selector); + w[57] = __byte_perm (w[47], w[46], selector); + w[56] = __byte_perm (w[46], w[45], selector); + w[55] = __byte_perm (w[45], w[44], selector); + w[54] = __byte_perm (w[44], w[43], selector); + w[53] = __byte_perm (w[43], w[42], selector); + w[52] = __byte_perm (w[42], w[41], selector); + w[51] = __byte_perm (w[41], w[40], selector); + w[50] = __byte_perm (w[40], w[39], selector); + w[49] = __byte_perm (w[39], w[38], 
selector); + w[48] = __byte_perm (w[38], w[37], selector); + w[47] = __byte_perm (w[37], w[36], selector); + w[46] = __byte_perm (w[36], w[35], selector); + w[45] = __byte_perm (w[35], w[34], selector); + w[44] = __byte_perm (w[34], w[33], selector); + w[43] = __byte_perm (w[33], w[32], selector); + w[42] = __byte_perm (w[32], w[31], selector); + w[41] = __byte_perm (w[31], w[30], selector); + w[40] = __byte_perm (w[30], w[29], selector); + w[39] = __byte_perm (w[29], w[28], selector); + w[38] = __byte_perm (w[28], w[27], selector); + w[37] = __byte_perm (w[27], w[26], selector); + w[36] = __byte_perm (w[26], w[25], selector); + w[35] = __byte_perm (w[25], w[24], selector); + w[34] = __byte_perm (w[24], w[23], selector); + w[33] = __byte_perm (w[23], w[22], selector); + w[32] = __byte_perm (w[22], w[21], selector); + w[31] = __byte_perm (w[21], w[20], selector); + w[30] = __byte_perm (w[20], w[19], selector); + w[29] = __byte_perm (w[19], w[18], selector); + w[28] = __byte_perm (w[18], w[17], selector); + w[27] = __byte_perm (w[17], w[16], selector); + w[26] = __byte_perm (w[16], w[15], selector); + w[25] = __byte_perm (w[15], w[14], selector); + w[24] = __byte_perm (w[14], w[13], selector); + w[23] = __byte_perm (w[13], w[12], selector); + w[22] = __byte_perm (w[12], w[11], selector); + w[21] = __byte_perm (w[11], w[10], selector); + w[20] = __byte_perm (w[10], w[ 9], selector); + w[19] = __byte_perm (w[ 9], w[ 8], selector); + w[18] = __byte_perm (w[ 8], w[ 7], selector); + w[17] = __byte_perm (w[ 7], w[ 6], selector); + w[16] = __byte_perm (w[ 6], w[ 5], selector); + w[15] = __byte_perm (w[ 5], w[ 4], selector); + w[14] = __byte_perm (w[ 4], w[ 3], selector); + w[13] = __byte_perm (w[ 3], w[ 2], selector); + w[12] = __byte_perm (w[ 2], w[ 1], selector); + w[11] = __byte_perm (w[ 1], w[ 0], selector); + w[10] = __byte_perm (w[ 0], 0, selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + 
w[ 0] = 0; + + break; + + case 11: + w[63] = __byte_perm (w[52], w[51], selector); + w[62] = __byte_perm (w[51], w[50], selector); + w[61] = __byte_perm (w[50], w[49], selector); + w[60] = __byte_perm (w[49], w[48], selector); + w[59] = __byte_perm (w[48], w[47], selector); + w[58] = __byte_perm (w[47], w[46], selector); + w[57] = __byte_perm (w[46], w[45], selector); + w[56] = __byte_perm (w[45], w[44], selector); + w[55] = __byte_perm (w[44], w[43], selector); + w[54] = __byte_perm (w[43], w[42], selector); + w[53] = __byte_perm (w[42], w[41], selector); + w[52] = __byte_perm (w[41], w[40], selector); + w[51] = __byte_perm (w[40], w[39], selector); + w[50] = __byte_perm (w[39], w[38], selector); + w[49] = __byte_perm (w[38], w[37], selector); + w[48] = __byte_perm (w[37], w[36], selector); + w[47] = __byte_perm (w[36], w[35], selector); + w[46] = __byte_perm (w[35], w[34], selector); + w[45] = __byte_perm (w[34], w[33], selector); + w[44] = __byte_perm (w[33], w[32], selector); + w[43] = __byte_perm (w[32], w[31], selector); + w[42] = __byte_perm (w[31], w[30], selector); + w[41] = __byte_perm (w[30], w[29], selector); + w[40] = __byte_perm (w[29], w[28], selector); + w[39] = __byte_perm (w[28], w[27], selector); + w[38] = __byte_perm (w[27], w[26], selector); + w[37] = __byte_perm (w[26], w[25], selector); + w[36] = __byte_perm (w[25], w[24], selector); + w[35] = __byte_perm (w[24], w[23], selector); + w[34] = __byte_perm (w[23], w[22], selector); + w[33] = __byte_perm (w[22], w[21], selector); + w[32] = __byte_perm (w[21], w[20], selector); + w[31] = __byte_perm (w[20], w[19], selector); + w[30] = __byte_perm (w[19], w[18], selector); + w[29] = __byte_perm (w[18], w[17], selector); + w[28] = __byte_perm (w[17], w[16], selector); + w[27] = __byte_perm (w[16], w[15], selector); + w[26] = __byte_perm (w[15], w[14], selector); + w[25] = __byte_perm (w[14], w[13], selector); + w[24] = __byte_perm (w[13], w[12], selector); + w[23] = __byte_perm (w[12], w[11], 
selector); + w[22] = __byte_perm (w[11], w[10], selector); + w[21] = __byte_perm (w[10], w[ 9], selector); + w[20] = __byte_perm (w[ 9], w[ 8], selector); + w[19] = __byte_perm (w[ 8], w[ 7], selector); + w[18] = __byte_perm (w[ 7], w[ 6], selector); + w[17] = __byte_perm (w[ 6], w[ 5], selector); + w[16] = __byte_perm (w[ 5], w[ 4], selector); + w[15] = __byte_perm (w[ 4], w[ 3], selector); + w[14] = __byte_perm (w[ 3], w[ 2], selector); + w[13] = __byte_perm (w[ 2], w[ 1], selector); + w[12] = __byte_perm (w[ 1], w[ 0], selector); + w[11] = __byte_perm (w[ 0], 0, selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = __byte_perm (w[51], w[50], selector); + w[62] = __byte_perm (w[50], w[49], selector); + w[61] = __byte_perm (w[49], w[48], selector); + w[60] = __byte_perm (w[48], w[47], selector); + w[59] = __byte_perm (w[47], w[46], selector); + w[58] = __byte_perm (w[46], w[45], selector); + w[57] = __byte_perm (w[45], w[44], selector); + w[56] = __byte_perm (w[44], w[43], selector); + w[55] = __byte_perm (w[43], w[42], selector); + w[54] = __byte_perm (w[42], w[41], selector); + w[53] = __byte_perm (w[41], w[40], selector); + w[52] = __byte_perm (w[40], w[39], selector); + w[51] = __byte_perm (w[39], w[38], selector); + w[50] = __byte_perm (w[38], w[37], selector); + w[49] = __byte_perm (w[37], w[36], selector); + w[48] = __byte_perm (w[36], w[35], selector); + w[47] = __byte_perm (w[35], w[34], selector); + w[46] = __byte_perm (w[34], w[33], selector); + w[45] = __byte_perm (w[33], w[32], selector); + w[44] = __byte_perm (w[32], w[31], selector); + w[43] = __byte_perm (w[31], w[30], selector); + w[42] = __byte_perm (w[30], w[29], selector); + w[41] = __byte_perm (w[29], w[28], selector); + w[40] = __byte_perm (w[28], w[27], selector); + w[39] = __byte_perm (w[27], w[26], selector); + w[38] = __byte_perm (w[26], w[25], selector); + 
w[37] = __byte_perm (w[25], w[24], selector); + w[36] = __byte_perm (w[24], w[23], selector); + w[35] = __byte_perm (w[23], w[22], selector); + w[34] = __byte_perm (w[22], w[21], selector); + w[33] = __byte_perm (w[21], w[20], selector); + w[32] = __byte_perm (w[20], w[19], selector); + w[31] = __byte_perm (w[19], w[18], selector); + w[30] = __byte_perm (w[18], w[17], selector); + w[29] = __byte_perm (w[17], w[16], selector); + w[28] = __byte_perm (w[16], w[15], selector); + w[27] = __byte_perm (w[15], w[14], selector); + w[26] = __byte_perm (w[14], w[13], selector); + w[25] = __byte_perm (w[13], w[12], selector); + w[24] = __byte_perm (w[12], w[11], selector); + w[23] = __byte_perm (w[11], w[10], selector); + w[22] = __byte_perm (w[10], w[ 9], selector); + w[21] = __byte_perm (w[ 9], w[ 8], selector); + w[20] = __byte_perm (w[ 8], w[ 7], selector); + w[19] = __byte_perm (w[ 7], w[ 6], selector); + w[18] = __byte_perm (w[ 6], w[ 5], selector); + w[17] = __byte_perm (w[ 5], w[ 4], selector); + w[16] = __byte_perm (w[ 4], w[ 3], selector); + w[15] = __byte_perm (w[ 3], w[ 2], selector); + w[14] = __byte_perm (w[ 2], w[ 1], selector); + w[13] = __byte_perm (w[ 1], w[ 0], selector); + w[12] = __byte_perm (w[ 0], 0, selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = __byte_perm (w[50], w[49], selector); + w[62] = __byte_perm (w[49], w[48], selector); + w[61] = __byte_perm (w[48], w[47], selector); + w[60] = __byte_perm (w[47], w[46], selector); + w[59] = __byte_perm (w[46], w[45], selector); + w[58] = __byte_perm (w[45], w[44], selector); + w[57] = __byte_perm (w[44], w[43], selector); + w[56] = __byte_perm (w[43], w[42], selector); + w[55] = __byte_perm (w[42], w[41], selector); + w[54] = __byte_perm (w[41], w[40], selector); + w[53] = __byte_perm (w[40], w[39], selector); + w[52] = __byte_perm (w[39], w[38], selector); + 
w[51] = __byte_perm (w[38], w[37], selector); + w[50] = __byte_perm (w[37], w[36], selector); + w[49] = __byte_perm (w[36], w[35], selector); + w[48] = __byte_perm (w[35], w[34], selector); + w[47] = __byte_perm (w[34], w[33], selector); + w[46] = __byte_perm (w[33], w[32], selector); + w[45] = __byte_perm (w[32], w[31], selector); + w[44] = __byte_perm (w[31], w[30], selector); + w[43] = __byte_perm (w[30], w[29], selector); + w[42] = __byte_perm (w[29], w[28], selector); + w[41] = __byte_perm (w[28], w[27], selector); + w[40] = __byte_perm (w[27], w[26], selector); + w[39] = __byte_perm (w[26], w[25], selector); + w[38] = __byte_perm (w[25], w[24], selector); + w[37] = __byte_perm (w[24], w[23], selector); + w[36] = __byte_perm (w[23], w[22], selector); + w[35] = __byte_perm (w[22], w[21], selector); + w[34] = __byte_perm (w[21], w[20], selector); + w[33] = __byte_perm (w[20], w[19], selector); + w[32] = __byte_perm (w[19], w[18], selector); + w[31] = __byte_perm (w[18], w[17], selector); + w[30] = __byte_perm (w[17], w[16], selector); + w[29] = __byte_perm (w[16], w[15], selector); + w[28] = __byte_perm (w[15], w[14], selector); + w[27] = __byte_perm (w[14], w[13], selector); + w[26] = __byte_perm (w[13], w[12], selector); + w[25] = __byte_perm (w[12], w[11], selector); + w[24] = __byte_perm (w[11], w[10], selector); + w[23] = __byte_perm (w[10], w[ 9], selector); + w[22] = __byte_perm (w[ 9], w[ 8], selector); + w[21] = __byte_perm (w[ 8], w[ 7], selector); + w[20] = __byte_perm (w[ 7], w[ 6], selector); + w[19] = __byte_perm (w[ 6], w[ 5], selector); + w[18] = __byte_perm (w[ 5], w[ 4], selector); + w[17] = __byte_perm (w[ 4], w[ 3], selector); + w[16] = __byte_perm (w[ 3], w[ 2], selector); + w[15] = __byte_perm (w[ 2], w[ 1], selector); + w[14] = __byte_perm (w[ 1], w[ 0], selector); + w[13] = __byte_perm (w[ 0], 0, selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + 
w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = __byte_perm (w[49], w[48], selector); + w[62] = __byte_perm (w[48], w[47], selector); + w[61] = __byte_perm (w[47], w[46], selector); + w[60] = __byte_perm (w[46], w[45], selector); + w[59] = __byte_perm (w[45], w[44], selector); + w[58] = __byte_perm (w[44], w[43], selector); + w[57] = __byte_perm (w[43], w[42], selector); + w[56] = __byte_perm (w[42], w[41], selector); + w[55] = __byte_perm (w[41], w[40], selector); + w[54] = __byte_perm (w[40], w[39], selector); + w[53] = __byte_perm (w[39], w[38], selector); + w[52] = __byte_perm (w[38], w[37], selector); + w[51] = __byte_perm (w[37], w[36], selector); + w[50] = __byte_perm (w[36], w[35], selector); + w[49] = __byte_perm (w[35], w[34], selector); + w[48] = __byte_perm (w[34], w[33], selector); + w[47] = __byte_perm (w[33], w[32], selector); + w[46] = __byte_perm (w[32], w[31], selector); + w[45] = __byte_perm (w[31], w[30], selector); + w[44] = __byte_perm (w[30], w[29], selector); + w[43] = __byte_perm (w[29], w[28], selector); + w[42] = __byte_perm (w[28], w[27], selector); + w[41] = __byte_perm (w[27], w[26], selector); + w[40] = __byte_perm (w[26], w[25], selector); + w[39] = __byte_perm (w[25], w[24], selector); + w[38] = __byte_perm (w[24], w[23], selector); + w[37] = __byte_perm (w[23], w[22], selector); + w[36] = __byte_perm (w[22], w[21], selector); + w[35] = __byte_perm (w[21], w[20], selector); + w[34] = __byte_perm (w[20], w[19], selector); + w[33] = __byte_perm (w[19], w[18], selector); + w[32] = __byte_perm (w[18], w[17], selector); + w[31] = __byte_perm (w[17], w[16], selector); + w[30] = __byte_perm (w[16], w[15], selector); + w[29] = __byte_perm (w[15], w[14], selector); + w[28] = __byte_perm (w[14], w[13], selector); + w[27] = __byte_perm (w[13], w[12], selector); + w[26] = __byte_perm (w[12], w[11], selector); + w[25] = __byte_perm (w[11], w[10], selector); + w[24] = __byte_perm (w[10], w[ 9], selector); + w[23] = 
__byte_perm (w[ 9], w[ 8], selector); + w[22] = __byte_perm (w[ 8], w[ 7], selector); + w[21] = __byte_perm (w[ 7], w[ 6], selector); + w[20] = __byte_perm (w[ 6], w[ 5], selector); + w[19] = __byte_perm (w[ 5], w[ 4], selector); + w[18] = __byte_perm (w[ 4], w[ 3], selector); + w[17] = __byte_perm (w[ 3], w[ 2], selector); + w[16] = __byte_perm (w[ 2], w[ 1], selector); + w[15] = __byte_perm (w[ 1], w[ 0], selector); + w[14] = __byte_perm (w[ 0], 0, selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = __byte_perm (w[48], w[47], selector); + w[62] = __byte_perm (w[47], w[46], selector); + w[61] = __byte_perm (w[46], w[45], selector); + w[60] = __byte_perm (w[45], w[44], selector); + w[59] = __byte_perm (w[44], w[43], selector); + w[58] = __byte_perm (w[43], w[42], selector); + w[57] = __byte_perm (w[42], w[41], selector); + w[56] = __byte_perm (w[41], w[40], selector); + w[55] = __byte_perm (w[40], w[39], selector); + w[54] = __byte_perm (w[39], w[38], selector); + w[53] = __byte_perm (w[38], w[37], selector); + w[52] = __byte_perm (w[37], w[36], selector); + w[51] = __byte_perm (w[36], w[35], selector); + w[50] = __byte_perm (w[35], w[34], selector); + w[49] = __byte_perm (w[34], w[33], selector); + w[48] = __byte_perm (w[33], w[32], selector); + w[47] = __byte_perm (w[32], w[31], selector); + w[46] = __byte_perm (w[31], w[30], selector); + w[45] = __byte_perm (w[30], w[29], selector); + w[44] = __byte_perm (w[29], w[28], selector); + w[43] = __byte_perm (w[28], w[27], selector); + w[42] = __byte_perm (w[27], w[26], selector); + w[41] = __byte_perm (w[26], w[25], selector); + w[40] = __byte_perm (w[25], w[24], selector); + w[39] = __byte_perm (w[24], w[23], selector); + w[38] = __byte_perm (w[23], w[22], selector); + w[37] = __byte_perm (w[22], w[21], selector); + w[36] = __byte_perm (w[21], 
w[20], selector); + w[35] = __byte_perm (w[20], w[19], selector); + w[34] = __byte_perm (w[19], w[18], selector); + w[33] = __byte_perm (w[18], w[17], selector); + w[32] = __byte_perm (w[17], w[16], selector); + w[31] = __byte_perm (w[16], w[15], selector); + w[30] = __byte_perm (w[15], w[14], selector); + w[29] = __byte_perm (w[14], w[13], selector); + w[28] = __byte_perm (w[13], w[12], selector); + w[27] = __byte_perm (w[12], w[11], selector); + w[26] = __byte_perm (w[11], w[10], selector); + w[25] = __byte_perm (w[10], w[ 9], selector); + w[24] = __byte_perm (w[ 9], w[ 8], selector); + w[23] = __byte_perm (w[ 8], w[ 7], selector); + w[22] = __byte_perm (w[ 7], w[ 6], selector); + w[21] = __byte_perm (w[ 6], w[ 5], selector); + w[20] = __byte_perm (w[ 5], w[ 4], selector); + w[19] = __byte_perm (w[ 4], w[ 3], selector); + w[18] = __byte_perm (w[ 3], w[ 2], selector); + w[17] = __byte_perm (w[ 2], w[ 1], selector); + w[16] = __byte_perm (w[ 1], w[ 0], selector); + w[15] = __byte_perm (w[ 0], 0, selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = __byte_perm (w[47], w[46], selector); + w[62] = __byte_perm (w[46], w[45], selector); + w[61] = __byte_perm (w[45], w[44], selector); + w[60] = __byte_perm (w[44], w[43], selector); + w[59] = __byte_perm (w[43], w[42], selector); + w[58] = __byte_perm (w[42], w[41], selector); + w[57] = __byte_perm (w[41], w[40], selector); + w[56] = __byte_perm (w[40], w[39], selector); + w[55] = __byte_perm (w[39], w[38], selector); + w[54] = __byte_perm (w[38], w[37], selector); + w[53] = __byte_perm (w[37], w[36], selector); + w[52] = __byte_perm (w[36], w[35], selector); + w[51] = __byte_perm (w[35], w[34], selector); + w[50] = __byte_perm (w[34], w[33], selector); + w[49] = __byte_perm (w[33], w[32], selector); + w[48] = __byte_perm (w[32], w[31], 
selector); + w[47] = __byte_perm (w[31], w[30], selector); + w[46] = __byte_perm (w[30], w[29], selector); + w[45] = __byte_perm (w[29], w[28], selector); + w[44] = __byte_perm (w[28], w[27], selector); + w[43] = __byte_perm (w[27], w[26], selector); + w[42] = __byte_perm (w[26], w[25], selector); + w[41] = __byte_perm (w[25], w[24], selector); + w[40] = __byte_perm (w[24], w[23], selector); + w[39] = __byte_perm (w[23], w[22], selector); + w[38] = __byte_perm (w[22], w[21], selector); + w[37] = __byte_perm (w[21], w[20], selector); + w[36] = __byte_perm (w[20], w[19], selector); + w[35] = __byte_perm (w[19], w[18], selector); + w[34] = __byte_perm (w[18], w[17], selector); + w[33] = __byte_perm (w[17], w[16], selector); + w[32] = __byte_perm (w[16], w[15], selector); + w[31] = __byte_perm (w[15], w[14], selector); + w[30] = __byte_perm (w[14], w[13], selector); + w[29] = __byte_perm (w[13], w[12], selector); + w[28] = __byte_perm (w[12], w[11], selector); + w[27] = __byte_perm (w[11], w[10], selector); + w[26] = __byte_perm (w[10], w[ 9], selector); + w[25] = __byte_perm (w[ 9], w[ 8], selector); + w[24] = __byte_perm (w[ 8], w[ 7], selector); + w[23] = __byte_perm (w[ 7], w[ 6], selector); + w[22] = __byte_perm (w[ 6], w[ 5], selector); + w[21] = __byte_perm (w[ 5], w[ 4], selector); + w[20] = __byte_perm (w[ 4], w[ 3], selector); + w[19] = __byte_perm (w[ 3], w[ 2], selector); + w[18] = __byte_perm (w[ 2], w[ 1], selector); + w[17] = __byte_perm (w[ 1], w[ 0], selector); + w[16] = __byte_perm (w[ 0], 0, selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = __byte_perm (w[46], w[45], selector); + w[62] = __byte_perm (w[45], w[44], selector); + w[61] = __byte_perm (w[44], w[43], selector); + w[60] = __byte_perm (w[43], w[42], selector); + w[59] = __byte_perm (w[42], 
w[41], selector); + w[58] = __byte_perm (w[41], w[40], selector); + w[57] = __byte_perm (w[40], w[39], selector); + w[56] = __byte_perm (w[39], w[38], selector); + w[55] = __byte_perm (w[38], w[37], selector); + w[54] = __byte_perm (w[37], w[36], selector); + w[53] = __byte_perm (w[36], w[35], selector); + w[52] = __byte_perm (w[35], w[34], selector); + w[51] = __byte_perm (w[34], w[33], selector); + w[50] = __byte_perm (w[33], w[32], selector); + w[49] = __byte_perm (w[32], w[31], selector); + w[48] = __byte_perm (w[31], w[30], selector); + w[47] = __byte_perm (w[30], w[29], selector); + w[46] = __byte_perm (w[29], w[28], selector); + w[45] = __byte_perm (w[28], w[27], selector); + w[44] = __byte_perm (w[27], w[26], selector); + w[43] = __byte_perm (w[26], w[25], selector); + w[42] = __byte_perm (w[25], w[24], selector); + w[41] = __byte_perm (w[24], w[23], selector); + w[40] = __byte_perm (w[23], w[22], selector); + w[39] = __byte_perm (w[22], w[21], selector); + w[38] = __byte_perm (w[21], w[20], selector); + w[37] = __byte_perm (w[20], w[19], selector); + w[36] = __byte_perm (w[19], w[18], selector); + w[35] = __byte_perm (w[18], w[17], selector); + w[34] = __byte_perm (w[17], w[16], selector); + w[33] = __byte_perm (w[16], w[15], selector); + w[32] = __byte_perm (w[15], w[14], selector); + w[31] = __byte_perm (w[14], w[13], selector); + w[30] = __byte_perm (w[13], w[12], selector); + w[29] = __byte_perm (w[12], w[11], selector); + w[28] = __byte_perm (w[11], w[10], selector); + w[27] = __byte_perm (w[10], w[ 9], selector); + w[26] = __byte_perm (w[ 9], w[ 8], selector); + w[25] = __byte_perm (w[ 8], w[ 7], selector); + w[24] = __byte_perm (w[ 7], w[ 6], selector); + w[23] = __byte_perm (w[ 6], w[ 5], selector); + w[22] = __byte_perm (w[ 5], w[ 4], selector); + w[21] = __byte_perm (w[ 4], w[ 3], selector); + w[20] = __byte_perm (w[ 3], w[ 2], selector); + w[19] = __byte_perm (w[ 2], w[ 1], selector); + w[18] = __byte_perm (w[ 1], w[ 0], selector); + w[17] = 
__byte_perm (w[ 0], 0, selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = __byte_perm (w[45], w[44], selector); + w[62] = __byte_perm (w[44], w[43], selector); + w[61] = __byte_perm (w[43], w[42], selector); + w[60] = __byte_perm (w[42], w[41], selector); + w[59] = __byte_perm (w[41], w[40], selector); + w[58] = __byte_perm (w[40], w[39], selector); + w[57] = __byte_perm (w[39], w[38], selector); + w[56] = __byte_perm (w[38], w[37], selector); + w[55] = __byte_perm (w[37], w[36], selector); + w[54] = __byte_perm (w[36], w[35], selector); + w[53] = __byte_perm (w[35], w[34], selector); + w[52] = __byte_perm (w[34], w[33], selector); + w[51] = __byte_perm (w[33], w[32], selector); + w[50] = __byte_perm (w[32], w[31], selector); + w[49] = __byte_perm (w[31], w[30], selector); + w[48] = __byte_perm (w[30], w[29], selector); + w[47] = __byte_perm (w[29], w[28], selector); + w[46] = __byte_perm (w[28], w[27], selector); + w[45] = __byte_perm (w[27], w[26], selector); + w[44] = __byte_perm (w[26], w[25], selector); + w[43] = __byte_perm (w[25], w[24], selector); + w[42] = __byte_perm (w[24], w[23], selector); + w[41] = __byte_perm (w[23], w[22], selector); + w[40] = __byte_perm (w[22], w[21], selector); + w[39] = __byte_perm (w[21], w[20], selector); + w[38] = __byte_perm (w[20], w[19], selector); + w[37] = __byte_perm (w[19], w[18], selector); + w[36] = __byte_perm (w[18], w[17], selector); + w[35] = __byte_perm (w[17], w[16], selector); + w[34] = __byte_perm (w[16], w[15], selector); + w[33] = __byte_perm (w[15], w[14], selector); + w[32] = __byte_perm (w[14], w[13], selector); + w[31] = __byte_perm (w[13], w[12], selector); + w[30] = __byte_perm (w[12], w[11], selector); + w[29] = __byte_perm (w[11], w[10], selector); + w[28] = __byte_perm (w[10], w[ 9], 
selector); + w[27] = __byte_perm (w[ 9], w[ 8], selector); + w[26] = __byte_perm (w[ 8], w[ 7], selector); + w[25] = __byte_perm (w[ 7], w[ 6], selector); + w[24] = __byte_perm (w[ 6], w[ 5], selector); + w[23] = __byte_perm (w[ 5], w[ 4], selector); + w[22] = __byte_perm (w[ 4], w[ 3], selector); + w[21] = __byte_perm (w[ 3], w[ 2], selector); + w[20] = __byte_perm (w[ 2], w[ 1], selector); + w[19] = __byte_perm (w[ 1], w[ 0], selector); + w[18] = __byte_perm (w[ 0], 0, selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = __byte_perm (w[44], w[43], selector); + w[62] = __byte_perm (w[43], w[42], selector); + w[61] = __byte_perm (w[42], w[41], selector); + w[60] = __byte_perm (w[41], w[40], selector); + w[59] = __byte_perm (w[40], w[39], selector); + w[58] = __byte_perm (w[39], w[38], selector); + w[57] = __byte_perm (w[38], w[37], selector); + w[56] = __byte_perm (w[37], w[36], selector); + w[55] = __byte_perm (w[36], w[35], selector); + w[54] = __byte_perm (w[35], w[34], selector); + w[53] = __byte_perm (w[34], w[33], selector); + w[52] = __byte_perm (w[33], w[32], selector); + w[51] = __byte_perm (w[32], w[31], selector); + w[50] = __byte_perm (w[31], w[30], selector); + w[49] = __byte_perm (w[30], w[29], selector); + w[48] = __byte_perm (w[29], w[28], selector); + w[47] = __byte_perm (w[28], w[27], selector); + w[46] = __byte_perm (w[27], w[26], selector); + w[45] = __byte_perm (w[26], w[25], selector); + w[44] = __byte_perm (w[25], w[24], selector); + w[43] = __byte_perm (w[24], w[23], selector); + w[42] = __byte_perm (w[23], w[22], selector); + w[41] = __byte_perm (w[22], w[21], selector); + w[40] = __byte_perm (w[21], w[20], selector); + w[39] = __byte_perm (w[20], w[19], selector); + w[38] = __byte_perm (w[19], w[18], selector); + 
w[37] = __byte_perm (w[18], w[17], selector); + w[36] = __byte_perm (w[17], w[16], selector); + w[35] = __byte_perm (w[16], w[15], selector); + w[34] = __byte_perm (w[15], w[14], selector); + w[33] = __byte_perm (w[14], w[13], selector); + w[32] = __byte_perm (w[13], w[12], selector); + w[31] = __byte_perm (w[12], w[11], selector); + w[30] = __byte_perm (w[11], w[10], selector); + w[29] = __byte_perm (w[10], w[ 9], selector); + w[28] = __byte_perm (w[ 9], w[ 8], selector); + w[27] = __byte_perm (w[ 8], w[ 7], selector); + w[26] = __byte_perm (w[ 7], w[ 6], selector); + w[25] = __byte_perm (w[ 6], w[ 5], selector); + w[24] = __byte_perm (w[ 5], w[ 4], selector); + w[23] = __byte_perm (w[ 4], w[ 3], selector); + w[22] = __byte_perm (w[ 3], w[ 2], selector); + w[21] = __byte_perm (w[ 2], w[ 1], selector); + w[20] = __byte_perm (w[ 1], w[ 0], selector); + w[19] = __byte_perm (w[ 0], 0, selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = __byte_perm (w[43], w[42], selector); + w[62] = __byte_perm (w[42], w[41], selector); + w[61] = __byte_perm (w[41], w[40], selector); + w[60] = __byte_perm (w[40], w[39], selector); + w[59] = __byte_perm (w[39], w[38], selector); + w[58] = __byte_perm (w[38], w[37], selector); + w[57] = __byte_perm (w[37], w[36], selector); + w[56] = __byte_perm (w[36], w[35], selector); + w[55] = __byte_perm (w[35], w[34], selector); + w[54] = __byte_perm (w[34], w[33], selector); + w[53] = __byte_perm (w[33], w[32], selector); + w[52] = __byte_perm (w[32], w[31], selector); + w[51] = __byte_perm (w[31], w[30], selector); + w[50] = __byte_perm (w[30], w[29], selector); + w[49] = __byte_perm (w[29], w[28], selector); + w[48] = __byte_perm (w[28], w[27], selector); + w[47] = __byte_perm (w[27], w[26], selector); + 
w[46] = __byte_perm (w[26], w[25], selector); + w[45] = __byte_perm (w[25], w[24], selector); + w[44] = __byte_perm (w[24], w[23], selector); + w[43] = __byte_perm (w[23], w[22], selector); + w[42] = __byte_perm (w[22], w[21], selector); + w[41] = __byte_perm (w[21], w[20], selector); + w[40] = __byte_perm (w[20], w[19], selector); + w[39] = __byte_perm (w[19], w[18], selector); + w[38] = __byte_perm (w[18], w[17], selector); + w[37] = __byte_perm (w[17], w[16], selector); + w[36] = __byte_perm (w[16], w[15], selector); + w[35] = __byte_perm (w[15], w[14], selector); + w[34] = __byte_perm (w[14], w[13], selector); + w[33] = __byte_perm (w[13], w[12], selector); + w[32] = __byte_perm (w[12], w[11], selector); + w[31] = __byte_perm (w[11], w[10], selector); + w[30] = __byte_perm (w[10], w[ 9], selector); + w[29] = __byte_perm (w[ 9], w[ 8], selector); + w[28] = __byte_perm (w[ 8], w[ 7], selector); + w[27] = __byte_perm (w[ 7], w[ 6], selector); + w[26] = __byte_perm (w[ 6], w[ 5], selector); + w[25] = __byte_perm (w[ 5], w[ 4], selector); + w[24] = __byte_perm (w[ 4], w[ 3], selector); + w[23] = __byte_perm (w[ 3], w[ 2], selector); + w[22] = __byte_perm (w[ 2], w[ 1], selector); + w[21] = __byte_perm (w[ 1], w[ 0], selector); + w[20] = __byte_perm (w[ 0], 0, selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = __byte_perm (w[42], w[41], selector); + w[62] = __byte_perm (w[41], w[40], selector); + w[61] = __byte_perm (w[40], w[39], selector); + w[60] = __byte_perm (w[39], w[38], selector); + w[59] = __byte_perm (w[38], w[37], selector); + w[58] = __byte_perm (w[37], w[36], selector); + w[57] = __byte_perm (w[36], w[35], selector); + w[56] = __byte_perm (w[35], w[34], selector); + w[55] = __byte_perm (w[34], w[33], 
selector); + w[54] = __byte_perm (w[33], w[32], selector); + w[53] = __byte_perm (w[32], w[31], selector); + w[52] = __byte_perm (w[31], w[30], selector); + w[51] = __byte_perm (w[30], w[29], selector); + w[50] = __byte_perm (w[29], w[28], selector); + w[49] = __byte_perm (w[28], w[27], selector); + w[48] = __byte_perm (w[27], w[26], selector); + w[47] = __byte_perm (w[26], w[25], selector); + w[46] = __byte_perm (w[25], w[24], selector); + w[45] = __byte_perm (w[24], w[23], selector); + w[44] = __byte_perm (w[23], w[22], selector); + w[43] = __byte_perm (w[22], w[21], selector); + w[42] = __byte_perm (w[21], w[20], selector); + w[41] = __byte_perm (w[20], w[19], selector); + w[40] = __byte_perm (w[19], w[18], selector); + w[39] = __byte_perm (w[18], w[17], selector); + w[38] = __byte_perm (w[17], w[16], selector); + w[37] = __byte_perm (w[16], w[15], selector); + w[36] = __byte_perm (w[15], w[14], selector); + w[35] = __byte_perm (w[14], w[13], selector); + w[34] = __byte_perm (w[13], w[12], selector); + w[33] = __byte_perm (w[12], w[11], selector); + w[32] = __byte_perm (w[11], w[10], selector); + w[31] = __byte_perm (w[10], w[ 9], selector); + w[30] = __byte_perm (w[ 9], w[ 8], selector); + w[29] = __byte_perm (w[ 8], w[ 7], selector); + w[28] = __byte_perm (w[ 7], w[ 6], selector); + w[27] = __byte_perm (w[ 6], w[ 5], selector); + w[26] = __byte_perm (w[ 5], w[ 4], selector); + w[25] = __byte_perm (w[ 4], w[ 3], selector); + w[24] = __byte_perm (w[ 3], w[ 2], selector); + w[23] = __byte_perm (w[ 2], w[ 1], selector); + w[22] = __byte_perm (w[ 1], w[ 0], selector); + w[21] = __byte_perm (w[ 0], 0, selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = __byte_perm (w[41], w[40], selector); + w[62] = 
__byte_perm (w[40], w[39], selector); + w[61] = __byte_perm (w[39], w[38], selector); + w[60] = __byte_perm (w[38], w[37], selector); + w[59] = __byte_perm (w[37], w[36], selector); + w[58] = __byte_perm (w[36], w[35], selector); + w[57] = __byte_perm (w[35], w[34], selector); + w[56] = __byte_perm (w[34], w[33], selector); + w[55] = __byte_perm (w[33], w[32], selector); + w[54] = __byte_perm (w[32], w[31], selector); + w[53] = __byte_perm (w[31], w[30], selector); + w[52] = __byte_perm (w[30], w[29], selector); + w[51] = __byte_perm (w[29], w[28], selector); + w[50] = __byte_perm (w[28], w[27], selector); + w[49] = __byte_perm (w[27], w[26], selector); + w[48] = __byte_perm (w[26], w[25], selector); + w[47] = __byte_perm (w[25], w[24], selector); + w[46] = __byte_perm (w[24], w[23], selector); + w[45] = __byte_perm (w[23], w[22], selector); + w[44] = __byte_perm (w[22], w[21], selector); + w[43] = __byte_perm (w[21], w[20], selector); + w[42] = __byte_perm (w[20], w[19], selector); + w[41] = __byte_perm (w[19], w[18], selector); + w[40] = __byte_perm (w[18], w[17], selector); + w[39] = __byte_perm (w[17], w[16], selector); + w[38] = __byte_perm (w[16], w[15], selector); + w[37] = __byte_perm (w[15], w[14], selector); + w[36] = __byte_perm (w[14], w[13], selector); + w[35] = __byte_perm (w[13], w[12], selector); + w[34] = __byte_perm (w[12], w[11], selector); + w[33] = __byte_perm (w[11], w[10], selector); + w[32] = __byte_perm (w[10], w[ 9], selector); + w[31] = __byte_perm (w[ 9], w[ 8], selector); + w[30] = __byte_perm (w[ 8], w[ 7], selector); + w[29] = __byte_perm (w[ 7], w[ 6], selector); + w[28] = __byte_perm (w[ 6], w[ 5], selector); + w[27] = __byte_perm (w[ 5], w[ 4], selector); + w[26] = __byte_perm (w[ 4], w[ 3], selector); + w[25] = __byte_perm (w[ 3], w[ 2], selector); + w[24] = __byte_perm (w[ 2], w[ 1], selector); + w[23] = __byte_perm (w[ 1], w[ 0], selector); + w[22] = __byte_perm (w[ 0], 0, selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + 
w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = __byte_perm (w[40], w[39], selector); + w[62] = __byte_perm (w[39], w[38], selector); + w[61] = __byte_perm (w[38], w[37], selector); + w[60] = __byte_perm (w[37], w[36], selector); + w[59] = __byte_perm (w[36], w[35], selector); + w[58] = __byte_perm (w[35], w[34], selector); + w[57] = __byte_perm (w[34], w[33], selector); + w[56] = __byte_perm (w[33], w[32], selector); + w[55] = __byte_perm (w[32], w[31], selector); + w[54] = __byte_perm (w[31], w[30], selector); + w[53] = __byte_perm (w[30], w[29], selector); + w[52] = __byte_perm (w[29], w[28], selector); + w[51] = __byte_perm (w[28], w[27], selector); + w[50] = __byte_perm (w[27], w[26], selector); + w[49] = __byte_perm (w[26], w[25], selector); + w[48] = __byte_perm (w[25], w[24], selector); + w[47] = __byte_perm (w[24], w[23], selector); + w[46] = __byte_perm (w[23], w[22], selector); + w[45] = __byte_perm (w[22], w[21], selector); + w[44] = __byte_perm (w[21], w[20], selector); + w[43] = __byte_perm (w[20], w[19], selector); + w[42] = __byte_perm (w[19], w[18], selector); + w[41] = __byte_perm (w[18], w[17], selector); + w[40] = __byte_perm (w[17], w[16], selector); + w[39] = __byte_perm (w[16], w[15], selector); + w[38] = __byte_perm (w[15], w[14], selector); + w[37] = __byte_perm (w[14], w[13], selector); + w[36] = __byte_perm (w[13], w[12], selector); + w[35] = __byte_perm (w[12], w[11], selector); + w[34] = __byte_perm (w[11], w[10], selector); + w[33] = __byte_perm (w[10], w[ 9], selector); + w[32] = __byte_perm (w[ 9], w[ 8], selector); + w[31] = __byte_perm (w[ 8], w[ 7], selector); + w[30] = __byte_perm (w[ 7], w[ 6], selector); + w[29] = __byte_perm (w[ 6], w[ 5], selector); + w[28] = __byte_perm (w[ 5], w[ 4], selector); + 
w[27] = __byte_perm (w[ 4], w[ 3], selector); + w[26] = __byte_perm (w[ 3], w[ 2], selector); + w[25] = __byte_perm (w[ 2], w[ 1], selector); + w[24] = __byte_perm (w[ 1], w[ 0], selector); + w[23] = __byte_perm (w[ 0], 0, selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = __byte_perm (w[39], w[38], selector); + w[62] = __byte_perm (w[38], w[37], selector); + w[61] = __byte_perm (w[37], w[36], selector); + w[60] = __byte_perm (w[36], w[35], selector); + w[59] = __byte_perm (w[35], w[34], selector); + w[58] = __byte_perm (w[34], w[33], selector); + w[57] = __byte_perm (w[33], w[32], selector); + w[56] = __byte_perm (w[32], w[31], selector); + w[55] = __byte_perm (w[31], w[30], selector); + w[54] = __byte_perm (w[30], w[29], selector); + w[53] = __byte_perm (w[29], w[28], selector); + w[52] = __byte_perm (w[28], w[27], selector); + w[51] = __byte_perm (w[27], w[26], selector); + w[50] = __byte_perm (w[26], w[25], selector); + w[49] = __byte_perm (w[25], w[24], selector); + w[48] = __byte_perm (w[24], w[23], selector); + w[47] = __byte_perm (w[23], w[22], selector); + w[46] = __byte_perm (w[22], w[21], selector); + w[45] = __byte_perm (w[21], w[20], selector); + w[44] = __byte_perm (w[20], w[19], selector); + w[43] = __byte_perm (w[19], w[18], selector); + w[42] = __byte_perm (w[18], w[17], selector); + w[41] = __byte_perm (w[17], w[16], selector); + w[40] = __byte_perm (w[16], w[15], selector); + w[39] = __byte_perm (w[15], w[14], selector); + w[38] = __byte_perm (w[14], w[13], selector); + w[37] = __byte_perm (w[13], w[12], selector); + w[36] = __byte_perm (w[12], w[11], selector); + w[35] = __byte_perm (w[11], w[10], selector); + w[34] = __byte_perm (w[10], w[ 9], selector); + 
w[33] = __byte_perm (w[ 9], w[ 8], selector); + w[32] = __byte_perm (w[ 8], w[ 7], selector); + w[31] = __byte_perm (w[ 7], w[ 6], selector); + w[30] = __byte_perm (w[ 6], w[ 5], selector); + w[29] = __byte_perm (w[ 5], w[ 4], selector); + w[28] = __byte_perm (w[ 4], w[ 3], selector); + w[27] = __byte_perm (w[ 3], w[ 2], selector); + w[26] = __byte_perm (w[ 2], w[ 1], selector); + w[25] = __byte_perm (w[ 1], w[ 0], selector); + w[24] = __byte_perm (w[ 0], 0, selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = __byte_perm (w[38], w[37], selector); + w[62] = __byte_perm (w[37], w[36], selector); + w[61] = __byte_perm (w[36], w[35], selector); + w[60] = __byte_perm (w[35], w[34], selector); + w[59] = __byte_perm (w[34], w[33], selector); + w[58] = __byte_perm (w[33], w[32], selector); + w[57] = __byte_perm (w[32], w[31], selector); + w[56] = __byte_perm (w[31], w[30], selector); + w[55] = __byte_perm (w[30], w[29], selector); + w[54] = __byte_perm (w[29], w[28], selector); + w[53] = __byte_perm (w[28], w[27], selector); + w[52] = __byte_perm (w[27], w[26], selector); + w[51] = __byte_perm (w[26], w[25], selector); + w[50] = __byte_perm (w[25], w[24], selector); + w[49] = __byte_perm (w[24], w[23], selector); + w[48] = __byte_perm (w[23], w[22], selector); + w[47] = __byte_perm (w[22], w[21], selector); + w[46] = __byte_perm (w[21], w[20], selector); + w[45] = __byte_perm (w[20], w[19], selector); + w[44] = __byte_perm (w[19], w[18], selector); + w[43] = __byte_perm (w[18], w[17], selector); + w[42] = __byte_perm (w[17], w[16], selector); + w[41] = __byte_perm (w[16], w[15], selector); + w[40] = __byte_perm (w[15], w[14], selector); + w[39] = __byte_perm (w[14], w[13], 
selector); + w[38] = __byte_perm (w[13], w[12], selector); + w[37] = __byte_perm (w[12], w[11], selector); + w[36] = __byte_perm (w[11], w[10], selector); + w[35] = __byte_perm (w[10], w[ 9], selector); + w[34] = __byte_perm (w[ 9], w[ 8], selector); + w[33] = __byte_perm (w[ 8], w[ 7], selector); + w[32] = __byte_perm (w[ 7], w[ 6], selector); + w[31] = __byte_perm (w[ 6], w[ 5], selector); + w[30] = __byte_perm (w[ 5], w[ 4], selector); + w[29] = __byte_perm (w[ 4], w[ 3], selector); + w[28] = __byte_perm (w[ 3], w[ 2], selector); + w[27] = __byte_perm (w[ 2], w[ 1], selector); + w[26] = __byte_perm (w[ 1], w[ 0], selector); + w[25] = __byte_perm (w[ 0], 0, selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = __byte_perm (w[37], w[36], selector); + w[62] = __byte_perm (w[36], w[35], selector); + w[61] = __byte_perm (w[35], w[34], selector); + w[60] = __byte_perm (w[34], w[33], selector); + w[59] = __byte_perm (w[33], w[32], selector); + w[58] = __byte_perm (w[32], w[31], selector); + w[57] = __byte_perm (w[31], w[30], selector); + w[56] = __byte_perm (w[30], w[29], selector); + w[55] = __byte_perm (w[29], w[28], selector); + w[54] = __byte_perm (w[28], w[27], selector); + w[53] = __byte_perm (w[27], w[26], selector); + w[52] = __byte_perm (w[26], w[25], selector); + w[51] = __byte_perm (w[25], w[24], selector); + w[50] = __byte_perm (w[24], w[23], selector); + w[49] = __byte_perm (w[23], w[22], selector); + w[48] = __byte_perm (w[22], w[21], selector); + w[47] = __byte_perm (w[21], w[20], selector); + w[46] = __byte_perm (w[20], w[19], selector); + w[45] = __byte_perm (w[19], w[18], selector); + w[44] = __byte_perm (w[18], w[17], selector); + w[43] = 
__byte_perm (w[17], w[16], selector); + w[42] = __byte_perm (w[16], w[15], selector); + w[41] = __byte_perm (w[15], w[14], selector); + w[40] = __byte_perm (w[14], w[13], selector); + w[39] = __byte_perm (w[13], w[12], selector); + w[38] = __byte_perm (w[12], w[11], selector); + w[37] = __byte_perm (w[11], w[10], selector); + w[36] = __byte_perm (w[10], w[ 9], selector); + w[35] = __byte_perm (w[ 9], w[ 8], selector); + w[34] = __byte_perm (w[ 8], w[ 7], selector); + w[33] = __byte_perm (w[ 7], w[ 6], selector); + w[32] = __byte_perm (w[ 6], w[ 5], selector); + w[31] = __byte_perm (w[ 5], w[ 4], selector); + w[30] = __byte_perm (w[ 4], w[ 3], selector); + w[29] = __byte_perm (w[ 3], w[ 2], selector); + w[28] = __byte_perm (w[ 2], w[ 1], selector); + w[27] = __byte_perm (w[ 1], w[ 0], selector); + w[26] = __byte_perm (w[ 0], 0, selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = __byte_perm (w[36], w[35], selector); + w[62] = __byte_perm (w[35], w[34], selector); + w[61] = __byte_perm (w[34], w[33], selector); + w[60] = __byte_perm (w[33], w[32], selector); + w[59] = __byte_perm (w[32], w[31], selector); + w[58] = __byte_perm (w[31], w[30], selector); + w[57] = __byte_perm (w[30], w[29], selector); + w[56] = __byte_perm (w[29], w[28], selector); + w[55] = __byte_perm (w[28], w[27], selector); + w[54] = __byte_perm (w[27], w[26], selector); + w[53] = __byte_perm (w[26], w[25], selector); + w[52] = __byte_perm (w[25], w[24], selector); + w[51] = __byte_perm (w[24], w[23], selector); + w[50] = __byte_perm (w[23], w[22], selector); + w[49] = __byte_perm (w[22], w[21], selector); + w[48] = __byte_perm (w[21], w[20], selector); + w[47] = 
__byte_perm (w[20], w[19], selector); + w[46] = __byte_perm (w[19], w[18], selector); + w[45] = __byte_perm (w[18], w[17], selector); + w[44] = __byte_perm (w[17], w[16], selector); + w[43] = __byte_perm (w[16], w[15], selector); + w[42] = __byte_perm (w[15], w[14], selector); + w[41] = __byte_perm (w[14], w[13], selector); + w[40] = __byte_perm (w[13], w[12], selector); + w[39] = __byte_perm (w[12], w[11], selector); + w[38] = __byte_perm (w[11], w[10], selector); + w[37] = __byte_perm (w[10], w[ 9], selector); + w[36] = __byte_perm (w[ 9], w[ 8], selector); + w[35] = __byte_perm (w[ 8], w[ 7], selector); + w[34] = __byte_perm (w[ 7], w[ 6], selector); + w[33] = __byte_perm (w[ 6], w[ 5], selector); + w[32] = __byte_perm (w[ 5], w[ 4], selector); + w[31] = __byte_perm (w[ 4], w[ 3], selector); + w[30] = __byte_perm (w[ 3], w[ 2], selector); + w[29] = __byte_perm (w[ 2], w[ 1], selector); + w[28] = __byte_perm (w[ 1], w[ 0], selector); + w[27] = __byte_perm (w[ 0], 0, selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = __byte_perm (w[35], w[34], selector); + w[62] = __byte_perm (w[34], w[33], selector); + w[61] = __byte_perm (w[33], w[32], selector); + w[60] = __byte_perm (w[32], w[31], selector); + w[59] = __byte_perm (w[31], w[30], selector); + w[58] = __byte_perm (w[30], w[29], selector); + w[57] = __byte_perm (w[29], w[28], selector); + w[56] = __byte_perm (w[28], w[27], selector); + w[55] = __byte_perm (w[27], w[26], selector); + w[54] = __byte_perm (w[26], w[25], selector); + w[53] = __byte_perm (w[25], w[24], selector); + w[52] = __byte_perm (w[24], w[23], selector); + w[51] = __byte_perm (w[23], w[22], selector); + 
w[50] = __byte_perm (w[22], w[21], selector); + w[49] = __byte_perm (w[21], w[20], selector); + w[48] = __byte_perm (w[20], w[19], selector); + w[47] = __byte_perm (w[19], w[18], selector); + w[46] = __byte_perm (w[18], w[17], selector); + w[45] = __byte_perm (w[17], w[16], selector); + w[44] = __byte_perm (w[16], w[15], selector); + w[43] = __byte_perm (w[15], w[14], selector); + w[42] = __byte_perm (w[14], w[13], selector); + w[41] = __byte_perm (w[13], w[12], selector); + w[40] = __byte_perm (w[12], w[11], selector); + w[39] = __byte_perm (w[11], w[10], selector); + w[38] = __byte_perm (w[10], w[ 9], selector); + w[37] = __byte_perm (w[ 9], w[ 8], selector); + w[36] = __byte_perm (w[ 8], w[ 7], selector); + w[35] = __byte_perm (w[ 7], w[ 6], selector); + w[34] = __byte_perm (w[ 6], w[ 5], selector); + w[33] = __byte_perm (w[ 5], w[ 4], selector); + w[32] = __byte_perm (w[ 4], w[ 3], selector); + w[31] = __byte_perm (w[ 3], w[ 2], selector); + w[30] = __byte_perm (w[ 2], w[ 1], selector); + w[29] = __byte_perm (w[ 1], w[ 0], selector); + w[28] = __byte_perm (w[ 0], 0, selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = __byte_perm (w[34], w[33], selector); + w[62] = __byte_perm (w[33], w[32], selector); + w[61] = __byte_perm (w[32], w[31], selector); + w[60] = __byte_perm (w[31], w[30], selector); + w[59] = __byte_perm (w[30], w[29], selector); + w[58] = __byte_perm (w[29], w[28], selector); + w[57] = __byte_perm (w[28], w[27], selector); + w[56] = __byte_perm (w[27], w[26], selector); + w[55] = __byte_perm (w[26], w[25], selector); + w[54] = __byte_perm (w[25], w[24], selector); + w[53] = __byte_perm (w[24], 
w[23], selector); + w[52] = __byte_perm (w[23], w[22], selector); + w[51] = __byte_perm (w[22], w[21], selector); + w[50] = __byte_perm (w[21], w[20], selector); + w[49] = __byte_perm (w[20], w[19], selector); + w[48] = __byte_perm (w[19], w[18], selector); + w[47] = __byte_perm (w[18], w[17], selector); + w[46] = __byte_perm (w[17], w[16], selector); + w[45] = __byte_perm (w[16], w[15], selector); + w[44] = __byte_perm (w[15], w[14], selector); + w[43] = __byte_perm (w[14], w[13], selector); + w[42] = __byte_perm (w[13], w[12], selector); + w[41] = __byte_perm (w[12], w[11], selector); + w[40] = __byte_perm (w[11], w[10], selector); + w[39] = __byte_perm (w[10], w[ 9], selector); + w[38] = __byte_perm (w[ 9], w[ 8], selector); + w[37] = __byte_perm (w[ 8], w[ 7], selector); + w[36] = __byte_perm (w[ 7], w[ 6], selector); + w[35] = __byte_perm (w[ 6], w[ 5], selector); + w[34] = __byte_perm (w[ 5], w[ 4], selector); + w[33] = __byte_perm (w[ 4], w[ 3], selector); + w[32] = __byte_perm (w[ 3], w[ 2], selector); + w[31] = __byte_perm (w[ 2], w[ 1], selector); + w[30] = __byte_perm (w[ 1], w[ 0], selector); + w[29] = __byte_perm (w[ 0], 0, selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = __byte_perm (w[33], w[32], selector); + w[62] = __byte_perm (w[32], w[31], selector); + w[61] = __byte_perm (w[31], w[30], selector); + w[60] = __byte_perm (w[30], w[29], selector); + w[59] = __byte_perm (w[29], w[28], selector); + w[58] = __byte_perm (w[28], w[27], selector); + w[57] = __byte_perm (w[27], w[26], selector); + w[56] = __byte_perm (w[26], w[25], selector); + w[55] = __byte_perm (w[25], w[24], selector); 
+ w[54] = __byte_perm (w[24], w[23], selector); + w[53] = __byte_perm (w[23], w[22], selector); + w[52] = __byte_perm (w[22], w[21], selector); + w[51] = __byte_perm (w[21], w[20], selector); + w[50] = __byte_perm (w[20], w[19], selector); + w[49] = __byte_perm (w[19], w[18], selector); + w[48] = __byte_perm (w[18], w[17], selector); + w[47] = __byte_perm (w[17], w[16], selector); + w[46] = __byte_perm (w[16], w[15], selector); + w[45] = __byte_perm (w[15], w[14], selector); + w[44] = __byte_perm (w[14], w[13], selector); + w[43] = __byte_perm (w[13], w[12], selector); + w[42] = __byte_perm (w[12], w[11], selector); + w[41] = __byte_perm (w[11], w[10], selector); + w[40] = __byte_perm (w[10], w[ 9], selector); + w[39] = __byte_perm (w[ 9], w[ 8], selector); + w[38] = __byte_perm (w[ 8], w[ 7], selector); + w[37] = __byte_perm (w[ 7], w[ 6], selector); + w[36] = __byte_perm (w[ 6], w[ 5], selector); + w[35] = __byte_perm (w[ 5], w[ 4], selector); + w[34] = __byte_perm (w[ 4], w[ 3], selector); + w[33] = __byte_perm (w[ 3], w[ 2], selector); + w[32] = __byte_perm (w[ 2], w[ 1], selector); + w[31] = __byte_perm (w[ 1], w[ 0], selector); + w[30] = __byte_perm (w[ 0], 0, selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = __byte_perm (w[32], w[31], selector); + w[62] = __byte_perm (w[31], w[30], selector); + w[61] = __byte_perm (w[30], w[29], selector); + w[60] = __byte_perm (w[29], w[28], selector); + w[59] = __byte_perm (w[28], w[27], selector); + w[58] = __byte_perm (w[27], w[26], selector); + w[57] = __byte_perm (w[26], w[25], selector); + w[56] = __byte_perm (w[25], w[24], selector); + 
w[55] = __byte_perm (w[24], w[23], selector); + w[54] = __byte_perm (w[23], w[22], selector); + w[53] = __byte_perm (w[22], w[21], selector); + w[52] = __byte_perm (w[21], w[20], selector); + w[51] = __byte_perm (w[20], w[19], selector); + w[50] = __byte_perm (w[19], w[18], selector); + w[49] = __byte_perm (w[18], w[17], selector); + w[48] = __byte_perm (w[17], w[16], selector); + w[47] = __byte_perm (w[16], w[15], selector); + w[46] = __byte_perm (w[15], w[14], selector); + w[45] = __byte_perm (w[14], w[13], selector); + w[44] = __byte_perm (w[13], w[12], selector); + w[43] = __byte_perm (w[12], w[11], selector); + w[42] = __byte_perm (w[11], w[10], selector); + w[41] = __byte_perm (w[10], w[ 9], selector); + w[40] = __byte_perm (w[ 9], w[ 8], selector); + w[39] = __byte_perm (w[ 8], w[ 7], selector); + w[38] = __byte_perm (w[ 7], w[ 6], selector); + w[37] = __byte_perm (w[ 6], w[ 5], selector); + w[36] = __byte_perm (w[ 5], w[ 4], selector); + w[35] = __byte_perm (w[ 4], w[ 3], selector); + w[34] = __byte_perm (w[ 3], w[ 2], selector); + w[33] = __byte_perm (w[ 2], w[ 1], selector); + w[32] = __byte_perm (w[ 1], w[ 0], selector); + w[31] = __byte_perm (w[ 0], 0, selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = __byte_perm (w[31], w[30], selector); + w[62] = __byte_perm (w[30], w[29], selector); + w[61] = __byte_perm (w[29], w[28], selector); + w[60] = __byte_perm (w[28], w[27], selector); + w[59] = __byte_perm (w[27], w[26], selector); + w[58] = __byte_perm (w[26], w[25], selector); + w[57] = __byte_perm (w[25], w[24], selector); + w[56] = __byte_perm (w[24], w[23], 
selector); + w[55] = __byte_perm (w[23], w[22], selector); + w[54] = __byte_perm (w[22], w[21], selector); + w[53] = __byte_perm (w[21], w[20], selector); + w[52] = __byte_perm (w[20], w[19], selector); + w[51] = __byte_perm (w[19], w[18], selector); + w[50] = __byte_perm (w[18], w[17], selector); + w[49] = __byte_perm (w[17], w[16], selector); + w[48] = __byte_perm (w[16], w[15], selector); + w[47] = __byte_perm (w[15], w[14], selector); + w[46] = __byte_perm (w[14], w[13], selector); + w[45] = __byte_perm (w[13], w[12], selector); + w[44] = __byte_perm (w[12], w[11], selector); + w[43] = __byte_perm (w[11], w[10], selector); + w[42] = __byte_perm (w[10], w[ 9], selector); + w[41] = __byte_perm (w[ 9], w[ 8], selector); + w[40] = __byte_perm (w[ 8], w[ 7], selector); + w[39] = __byte_perm (w[ 7], w[ 6], selector); + w[38] = __byte_perm (w[ 6], w[ 5], selector); + w[37] = __byte_perm (w[ 5], w[ 4], selector); + w[36] = __byte_perm (w[ 4], w[ 3], selector); + w[35] = __byte_perm (w[ 3], w[ 2], selector); + w[34] = __byte_perm (w[ 2], w[ 1], selector); + w[33] = __byte_perm (w[ 1], w[ 0], selector); + w[32] = __byte_perm (w[ 0], 0, selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = __byte_perm (w[30], w[29], selector); + w[62] = __byte_perm (w[29], w[28], selector); + w[61] = __byte_perm (w[28], w[27], selector); + w[60] = __byte_perm (w[27], w[26], selector); + w[59] = __byte_perm (w[26], w[25], selector); + w[58] = __byte_perm (w[25], w[24], selector); + w[57] = __byte_perm (w[24], w[23], selector); + w[56] = __byte_perm (w[23], w[22], selector); + w[55] = 
__byte_perm (w[22], w[21], selector); + w[54] = __byte_perm (w[21], w[20], selector); + w[53] = __byte_perm (w[20], w[19], selector); + w[52] = __byte_perm (w[19], w[18], selector); + w[51] = __byte_perm (w[18], w[17], selector); + w[50] = __byte_perm (w[17], w[16], selector); + w[49] = __byte_perm (w[16], w[15], selector); + w[48] = __byte_perm (w[15], w[14], selector); + w[47] = __byte_perm (w[14], w[13], selector); + w[46] = __byte_perm (w[13], w[12], selector); + w[45] = __byte_perm (w[12], w[11], selector); + w[44] = __byte_perm (w[11], w[10], selector); + w[43] = __byte_perm (w[10], w[ 9], selector); + w[42] = __byte_perm (w[ 9], w[ 8], selector); + w[41] = __byte_perm (w[ 8], w[ 7], selector); + w[40] = __byte_perm (w[ 7], w[ 6], selector); + w[39] = __byte_perm (w[ 6], w[ 5], selector); + w[38] = __byte_perm (w[ 5], w[ 4], selector); + w[37] = __byte_perm (w[ 4], w[ 3], selector); + w[36] = __byte_perm (w[ 3], w[ 2], selector); + w[35] = __byte_perm (w[ 2], w[ 1], selector); + w[34] = __byte_perm (w[ 1], w[ 0], selector); + w[33] = __byte_perm (w[ 0], 0, selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = __byte_perm (w[29], w[28], selector); + w[62] = __byte_perm (w[28], w[27], selector); + w[61] = __byte_perm (w[27], w[26], selector); + w[60] = __byte_perm (w[26], w[25], selector); + w[59] = __byte_perm (w[25], w[24], selector); + w[58] = __byte_perm (w[24], w[23], selector); + w[57] = __byte_perm (w[23], w[22], selector); + w[56] = __byte_perm (w[22], w[21], selector); + w[55] = __byte_perm (w[21], w[20], selector); + w[54] = __byte_perm 
(w[20], w[19], selector); + w[53] = __byte_perm (w[19], w[18], selector); + w[52] = __byte_perm (w[18], w[17], selector); + w[51] = __byte_perm (w[17], w[16], selector); + w[50] = __byte_perm (w[16], w[15], selector); + w[49] = __byte_perm (w[15], w[14], selector); + w[48] = __byte_perm (w[14], w[13], selector); + w[47] = __byte_perm (w[13], w[12], selector); + w[46] = __byte_perm (w[12], w[11], selector); + w[45] = __byte_perm (w[11], w[10], selector); + w[44] = __byte_perm (w[10], w[ 9], selector); + w[43] = __byte_perm (w[ 9], w[ 8], selector); + w[42] = __byte_perm (w[ 8], w[ 7], selector); + w[41] = __byte_perm (w[ 7], w[ 6], selector); + w[40] = __byte_perm (w[ 6], w[ 5], selector); + w[39] = __byte_perm (w[ 5], w[ 4], selector); + w[38] = __byte_perm (w[ 4], w[ 3], selector); + w[37] = __byte_perm (w[ 3], w[ 2], selector); + w[36] = __byte_perm (w[ 2], w[ 1], selector); + w[35] = __byte_perm (w[ 1], w[ 0], selector); + w[34] = __byte_perm (w[ 0], 0, selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = __byte_perm (w[28], w[27], selector); + w[62] = __byte_perm (w[27], w[26], selector); + w[61] = __byte_perm (w[26], w[25], selector); + w[60] = __byte_perm (w[25], w[24], selector); + w[59] = __byte_perm (w[24], w[23], selector); + w[58] = __byte_perm (w[23], w[22], selector); + w[57] = __byte_perm (w[22], w[21], selector); + w[56] = __byte_perm (w[21], w[20], selector); + w[55] = __byte_perm (w[20], w[19], selector); + w[54] = __byte_perm (w[19], w[18], selector); + w[53] = __byte_perm (w[18], w[17], selector); + w[52] = __byte_perm 
(w[17], w[16], selector); + w[51] = __byte_perm (w[16], w[15], selector); + w[50] = __byte_perm (w[15], w[14], selector); + w[49] = __byte_perm (w[14], w[13], selector); + w[48] = __byte_perm (w[13], w[12], selector); + w[47] = __byte_perm (w[12], w[11], selector); + w[46] = __byte_perm (w[11], w[10], selector); + w[45] = __byte_perm (w[10], w[ 9], selector); + w[44] = __byte_perm (w[ 9], w[ 8], selector); + w[43] = __byte_perm (w[ 8], w[ 7], selector); + w[42] = __byte_perm (w[ 7], w[ 6], selector); + w[41] = __byte_perm (w[ 6], w[ 5], selector); + w[40] = __byte_perm (w[ 5], w[ 4], selector); + w[39] = __byte_perm (w[ 4], w[ 3], selector); + w[38] = __byte_perm (w[ 3], w[ 2], selector); + w[37] = __byte_perm (w[ 2], w[ 1], selector); + w[36] = __byte_perm (w[ 1], w[ 0], selector); + w[35] = __byte_perm (w[ 0], 0, selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = __byte_perm (w[27], w[26], selector); + w[62] = __byte_perm (w[26], w[25], selector); + w[61] = __byte_perm (w[25], w[24], selector); + w[60] = __byte_perm (w[24], w[23], selector); + w[59] = __byte_perm (w[23], w[22], selector); + w[58] = __byte_perm (w[22], w[21], selector); + w[57] = __byte_perm (w[21], w[20], selector); + w[56] = __byte_perm (w[20], w[19], selector); + w[55] = __byte_perm (w[19], w[18], selector); + w[54] = __byte_perm (w[18], w[17], selector); + w[53] = __byte_perm (w[17], w[16], selector); + w[52] = __byte_perm (w[16], w[15], selector); + w[51] = __byte_perm (w[15], w[14], selector); + w[50] = __byte_perm (w[14], w[13], selector); + w[49] = 
__byte_perm (w[13], w[12], selector); + w[48] = __byte_perm (w[12], w[11], selector); + w[47] = __byte_perm (w[11], w[10], selector); + w[46] = __byte_perm (w[10], w[ 9], selector); + w[45] = __byte_perm (w[ 9], w[ 8], selector); + w[44] = __byte_perm (w[ 8], w[ 7], selector); + w[43] = __byte_perm (w[ 7], w[ 6], selector); + w[42] = __byte_perm (w[ 6], w[ 5], selector); + w[41] = __byte_perm (w[ 5], w[ 4], selector); + w[40] = __byte_perm (w[ 4], w[ 3], selector); + w[39] = __byte_perm (w[ 3], w[ 2], selector); + w[38] = __byte_perm (w[ 2], w[ 1], selector); + w[37] = __byte_perm (w[ 1], w[ 0], selector); + w[36] = __byte_perm (w[ 0], 0, selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = __byte_perm (w[26], w[25], selector); + w[62] = __byte_perm (w[25], w[24], selector); + w[61] = __byte_perm (w[24], w[23], selector); + w[60] = __byte_perm (w[23], w[22], selector); + w[59] = __byte_perm (w[22], w[21], selector); + w[58] = __byte_perm (w[21], w[20], selector); + w[57] = __byte_perm (w[20], w[19], selector); + w[56] = __byte_perm (w[19], w[18], selector); + w[55] = __byte_perm (w[18], w[17], selector); + w[54] = __byte_perm (w[17], w[16], selector); + w[53] = __byte_perm (w[16], w[15], selector); + w[52] = __byte_perm (w[15], w[14], selector); + w[51] = __byte_perm (w[14], w[13], selector); + w[50] = __byte_perm (w[13], w[12], selector); + w[49] = __byte_perm (w[12], w[11], selector); + w[48] = __byte_perm (w[11], w[10], selector); + w[47] = __byte_perm (w[10], w[ 9], selector); + w[46] = __byte_perm (w[ 9], w[ 
8], selector); + w[45] = __byte_perm (w[ 8], w[ 7], selector); + w[44] = __byte_perm (w[ 7], w[ 6], selector); + w[43] = __byte_perm (w[ 6], w[ 5], selector); + w[42] = __byte_perm (w[ 5], w[ 4], selector); + w[41] = __byte_perm (w[ 4], w[ 3], selector); + w[40] = __byte_perm (w[ 3], w[ 2], selector); + w[39] = __byte_perm (w[ 2], w[ 1], selector); + w[38] = __byte_perm (w[ 1], w[ 0], selector); + w[37] = __byte_perm (w[ 0], 0, selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = __byte_perm (w[25], w[24], selector); + w[62] = __byte_perm (w[24], w[23], selector); + w[61] = __byte_perm (w[23], w[22], selector); + w[60] = __byte_perm (w[22], w[21], selector); + w[59] = __byte_perm (w[21], w[20], selector); + w[58] = __byte_perm (w[20], w[19], selector); + w[57] = __byte_perm (w[19], w[18], selector); + w[56] = __byte_perm (w[18], w[17], selector); + w[55] = __byte_perm (w[17], w[16], selector); + w[54] = __byte_perm (w[16], w[15], selector); + w[53] = __byte_perm (w[15], w[14], selector); + w[52] = __byte_perm (w[14], w[13], selector); + w[51] = __byte_perm (w[13], w[12], selector); + w[50] = __byte_perm (w[12], w[11], selector); + w[49] = __byte_perm (w[11], w[10], selector); + w[48] = __byte_perm (w[10], w[ 9], selector); + w[47] = __byte_perm (w[ 9], w[ 8], selector); + w[46] = __byte_perm (w[ 8], w[ 7], selector); + w[45] = __byte_perm (w[ 7], w[ 6], selector); + w[44] = __byte_perm (w[ 6], w[ 5], selector); + w[43] = __byte_perm (w[ 5], w[ 4], selector); + w[42] = __byte_perm (w[ 4], w[ 3], 
selector); + w[41] = __byte_perm (w[ 3], w[ 2], selector); + w[40] = __byte_perm (w[ 2], w[ 1], selector); + w[39] = __byte_perm (w[ 1], w[ 0], selector); + w[38] = __byte_perm (w[ 0], 0, selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = __byte_perm (w[24], w[23], selector); + w[62] = __byte_perm (w[23], w[22], selector); + w[61] = __byte_perm (w[22], w[21], selector); + w[60] = __byte_perm (w[21], w[20], selector); + w[59] = __byte_perm (w[20], w[19], selector); + w[58] = __byte_perm (w[19], w[18], selector); + w[57] = __byte_perm (w[18], w[17], selector); + w[56] = __byte_perm (w[17], w[16], selector); + w[55] = __byte_perm (w[16], w[15], selector); + w[54] = __byte_perm (w[15], w[14], selector); + w[53] = __byte_perm (w[14], w[13], selector); + w[52] = __byte_perm (w[13], w[12], selector); + w[51] = __byte_perm (w[12], w[11], selector); + w[50] = __byte_perm (w[11], w[10], selector); + w[49] = __byte_perm (w[10], w[ 9], selector); + w[48] = __byte_perm (w[ 9], w[ 8], selector); + w[47] = __byte_perm (w[ 8], w[ 7], selector); + w[46] = __byte_perm (w[ 7], w[ 6], selector); + w[45] = __byte_perm (w[ 6], w[ 5], selector); + w[44] = __byte_perm (w[ 5], w[ 4], selector); + w[43] = __byte_perm (w[ 4], w[ 3], selector); + w[42] = __byte_perm (w[ 3], w[ 2], selector); + w[41] = __byte_perm (w[ 2], w[ 1], selector); + w[40] = __byte_perm (w[ 1], w[ 0], selector); + w[39] = __byte_perm (w[ 0], 0, selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] 
= 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = __byte_perm (w[23], w[22], selector); + w[62] = __byte_perm (w[22], w[21], selector); + w[61] = __byte_perm (w[21], w[20], selector); + w[60] = __byte_perm (w[20], w[19], selector); + w[59] = __byte_perm (w[19], w[18], selector); + w[58] = __byte_perm (w[18], w[17], selector); + w[57] = __byte_perm (w[17], w[16], selector); + w[56] = __byte_perm (w[16], w[15], selector); + w[55] = __byte_perm (w[15], w[14], selector); + w[54] = __byte_perm (w[14], w[13], selector); + w[53] = __byte_perm (w[13], w[12], selector); + w[52] = __byte_perm (w[12], w[11], selector); + w[51] = __byte_perm (w[11], w[10], selector); + w[50] = __byte_perm (w[10], w[ 9], selector); + w[49] = __byte_perm (w[ 9], w[ 8], selector); + w[48] = __byte_perm (w[ 8], w[ 7], selector); + w[47] = __byte_perm (w[ 7], w[ 6], selector); + w[46] = __byte_perm (w[ 6], w[ 5], selector); + w[45] = __byte_perm (w[ 5], w[ 4], selector); + w[44] = __byte_perm (w[ 4], w[ 3], selector); + w[43] = __byte_perm (w[ 3], w[ 2], selector); + w[42] = __byte_perm (w[ 2], w[ 1], selector); + w[41] = __byte_perm (w[ 1], w[ 0], selector); + w[40] = __byte_perm (w[ 0], 0, selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + 
w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = __byte_perm (w[22], w[21], selector); + w[62] = __byte_perm (w[21], w[20], selector); + w[61] = __byte_perm (w[20], w[19], selector); + w[60] = __byte_perm (w[19], w[18], selector); + w[59] = __byte_perm (w[18], w[17], selector); + w[58] = __byte_perm (w[17], w[16], selector); + w[57] = __byte_perm (w[16], w[15], selector); + w[56] = __byte_perm (w[15], w[14], selector); + w[55] = __byte_perm (w[14], w[13], selector); + w[54] = __byte_perm (w[13], w[12], selector); + w[53] = __byte_perm (w[12], w[11], selector); + w[52] = __byte_perm (w[11], w[10], selector); + w[51] = __byte_perm (w[10], w[ 9], selector); + w[50] = __byte_perm (w[ 9], w[ 8], selector); + w[49] = __byte_perm (w[ 8], w[ 7], selector); + w[48] = __byte_perm (w[ 7], w[ 6], selector); + w[47] = __byte_perm (w[ 6], w[ 5], selector); + w[46] = __byte_perm (w[ 5], w[ 4], selector); + w[45] = __byte_perm (w[ 4], w[ 3], selector); + w[44] = __byte_perm (w[ 3], w[ 2], selector); + w[43] = __byte_perm (w[ 2], w[ 1], selector); + w[42] = __byte_perm (w[ 1], w[ 0], selector); + w[41] = __byte_perm (w[ 0], 0, selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = __byte_perm (w[21], w[20], selector); + w[62] = __byte_perm (w[20], w[19], selector); + w[61] = __byte_perm (w[19], w[18], selector); + w[60] = __byte_perm (w[18], w[17], selector); + w[59] = 
__byte_perm (w[17], w[16], selector); + w[58] = __byte_perm (w[16], w[15], selector); + w[57] = __byte_perm (w[15], w[14], selector); + w[56] = __byte_perm (w[14], w[13], selector); + w[55] = __byte_perm (w[13], w[12], selector); + w[54] = __byte_perm (w[12], w[11], selector); + w[53] = __byte_perm (w[11], w[10], selector); + w[52] = __byte_perm (w[10], w[ 9], selector); + w[51] = __byte_perm (w[ 9], w[ 8], selector); + w[50] = __byte_perm (w[ 8], w[ 7], selector); + w[49] = __byte_perm (w[ 7], w[ 6], selector); + w[48] = __byte_perm (w[ 6], w[ 5], selector); + w[47] = __byte_perm (w[ 5], w[ 4], selector); + w[46] = __byte_perm (w[ 4], w[ 3], selector); + w[45] = __byte_perm (w[ 3], w[ 2], selector); + w[44] = __byte_perm (w[ 2], w[ 1], selector); + w[43] = __byte_perm (w[ 1], w[ 0], selector); + w[42] = __byte_perm (w[ 0], 0, selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = __byte_perm (w[20], w[19], selector); + w[62] = __byte_perm (w[19], w[18], selector); + w[61] = __byte_perm (w[18], w[17], selector); + w[60] = __byte_perm (w[17], w[16], selector); + w[59] = __byte_perm (w[16], w[15], selector); + w[58] = __byte_perm (w[15], w[14], selector); + w[57] = __byte_perm (w[14], w[13], selector); + w[56] = __byte_perm (w[13], w[12], selector); + w[55] = __byte_perm (w[12], w[11], selector); + w[54] = __byte_perm (w[11], w[10], selector); + w[53] = __byte_perm (w[10], w[ 9], selector); + w[52] = __byte_perm (w[ 9], w[ 8], selector); + 
w[51] = __byte_perm (w[ 8], w[ 7], selector); + w[50] = __byte_perm (w[ 7], w[ 6], selector); + w[49] = __byte_perm (w[ 6], w[ 5], selector); + w[48] = __byte_perm (w[ 5], w[ 4], selector); + w[47] = __byte_perm (w[ 4], w[ 3], selector); + w[46] = __byte_perm (w[ 3], w[ 2], selector); + w[45] = __byte_perm (w[ 2], w[ 1], selector); + w[44] = __byte_perm (w[ 1], w[ 0], selector); + w[43] = __byte_perm (w[ 0], 0, selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = __byte_perm (w[19], w[18], selector); + w[62] = __byte_perm (w[18], w[17], selector); + w[61] = __byte_perm (w[17], w[16], selector); + w[60] = __byte_perm (w[16], w[15], selector); + w[59] = __byte_perm (w[15], w[14], selector); + w[58] = __byte_perm (w[14], w[13], selector); + w[57] = __byte_perm (w[13], w[12], selector); + w[56] = __byte_perm (w[12], w[11], selector); + w[55] = __byte_perm (w[11], w[10], selector); + w[54] = __byte_perm (w[10], w[ 9], selector); + w[53] = __byte_perm (w[ 9], w[ 8], selector); + w[52] = __byte_perm (w[ 8], w[ 7], selector); + w[51] = __byte_perm (w[ 7], w[ 6], selector); + w[50] = __byte_perm (w[ 6], w[ 5], selector); + w[49] = __byte_perm (w[ 5], w[ 4], selector); + w[48] = __byte_perm (w[ 4], w[ 3], selector); + w[47] = __byte_perm (w[ 3], w[ 2], selector); + w[46] = __byte_perm (w[ 2], w[ 1], selector); + w[45] = __byte_perm (w[ 1], w[ 0], selector); + w[44] = __byte_perm (w[ 0], 0, selector); + w[43] = 0; + w[42] = 0; + w[41] 
= 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = __byte_perm (w[18], w[17], selector); + w[62] = __byte_perm (w[17], w[16], selector); + w[61] = __byte_perm (w[16], w[15], selector); + w[60] = __byte_perm (w[15], w[14], selector); + w[59] = __byte_perm (w[14], w[13], selector); + w[58] = __byte_perm (w[13], w[12], selector); + w[57] = __byte_perm (w[12], w[11], selector); + w[56] = __byte_perm (w[11], w[10], selector); + w[55] = __byte_perm (w[10], w[ 9], selector); + w[54] = __byte_perm (w[ 9], w[ 8], selector); + w[53] = __byte_perm (w[ 8], w[ 7], selector); + w[52] = __byte_perm (w[ 7], w[ 6], selector); + w[51] = __byte_perm (w[ 6], w[ 5], selector); + w[50] = __byte_perm (w[ 5], w[ 4], selector); + w[49] = __byte_perm (w[ 4], w[ 3], selector); + w[48] = __byte_perm (w[ 3], w[ 2], selector); + w[47] = __byte_perm (w[ 2], w[ 1], selector); + w[46] = __byte_perm (w[ 1], w[ 0], selector); + w[45] = __byte_perm (w[ 0], 0, selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 
0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = __byte_perm (w[17], w[16], selector); + w[62] = __byte_perm (w[16], w[15], selector); + w[61] = __byte_perm (w[15], w[14], selector); + w[60] = __byte_perm (w[14], w[13], selector); + w[59] = __byte_perm (w[13], w[12], selector); + w[58] = __byte_perm (w[12], w[11], selector); + w[57] = __byte_perm (w[11], w[10], selector); + w[56] = __byte_perm (w[10], w[ 9], selector); + w[55] = __byte_perm (w[ 9], w[ 8], selector); + w[54] = __byte_perm (w[ 8], w[ 7], selector); + w[53] = __byte_perm (w[ 7], w[ 6], selector); + w[52] = __byte_perm (w[ 6], w[ 5], selector); + w[51] = __byte_perm (w[ 5], w[ 4], selector); + w[50] = __byte_perm (w[ 4], w[ 3], selector); + w[49] = __byte_perm (w[ 3], w[ 2], selector); + w[48] = __byte_perm (w[ 2], w[ 1], selector); + w[47] = __byte_perm (w[ 1], w[ 0], selector); + w[46] = __byte_perm (w[ 0], 0, selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = __byte_perm (w[16], w[15], selector); + w[62] = __byte_perm (w[15], w[14], selector); + w[61] = __byte_perm (w[14], w[13], selector); + w[60] = __byte_perm (w[13], w[12], selector); + w[59] = __byte_perm (w[12], w[11], selector); + w[58] = __byte_perm (w[11], w[10], selector); + w[57] = __byte_perm (w[10], w[ 9], selector); + w[56] = __byte_perm (w[ 9], w[ 8], selector); + w[55] = __byte_perm (w[ 8], w[ 7], selector); + 
w[54] = __byte_perm (w[ 7], w[ 6], selector); + w[53] = __byte_perm (w[ 6], w[ 5], selector); + w[52] = __byte_perm (w[ 5], w[ 4], selector); + w[51] = __byte_perm (w[ 4], w[ 3], selector); + w[50] = __byte_perm (w[ 3], w[ 2], selector); + w[49] = __byte_perm (w[ 2], w[ 1], selector); + w[48] = __byte_perm (w[ 1], w[ 0], selector); + w[47] = __byte_perm (w[ 0], 0, selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = __byte_perm (w[15], w[14], selector); + w[62] = __byte_perm (w[14], w[13], selector); + w[61] = __byte_perm (w[13], w[12], selector); + w[60] = __byte_perm (w[12], w[11], selector); + w[59] = __byte_perm (w[11], w[10], selector); + w[58] = __byte_perm (w[10], w[ 9], selector); + w[57] = __byte_perm (w[ 9], w[ 8], selector); + w[56] = __byte_perm (w[ 8], w[ 7], selector); + w[55] = __byte_perm (w[ 7], w[ 6], selector); + w[54] = __byte_perm (w[ 6], w[ 5], selector); + w[53] = __byte_perm (w[ 5], w[ 4], selector); + w[52] = __byte_perm (w[ 4], w[ 3], selector); + w[51] = __byte_perm (w[ 3], w[ 2], selector); + w[50] = __byte_perm (w[ 2], w[ 1], selector); + w[49] = __byte_perm (w[ 1], w[ 0], selector); + w[48] = __byte_perm (w[ 0], 0, selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + 
w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = __byte_perm (w[14], w[13], selector); + w[62] = __byte_perm (w[13], w[12], selector); + w[61] = __byte_perm (w[12], w[11], selector); + w[60] = __byte_perm (w[11], w[10], selector); + w[59] = __byte_perm (w[10], w[ 9], selector); + w[58] = __byte_perm (w[ 9], w[ 8], selector); + w[57] = __byte_perm (w[ 8], w[ 7], selector); + w[56] = __byte_perm (w[ 7], w[ 6], selector); + w[55] = __byte_perm (w[ 6], w[ 5], selector); + w[54] = __byte_perm (w[ 5], w[ 4], selector); + w[53] = __byte_perm (w[ 4], w[ 3], selector); + w[52] = __byte_perm (w[ 3], w[ 2], selector); + w[51] = __byte_perm (w[ 2], w[ 1], selector); + w[50] = __byte_perm (w[ 1], w[ 0], selector); + w[49] = __byte_perm (w[ 0], 0, selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = __byte_perm (w[13], w[12], selector); + w[62] = __byte_perm (w[12], w[11], selector); + w[61] = __byte_perm (w[11], w[10], selector); + w[60] = __byte_perm (w[10], w[ 9], selector); + 
w[59] = __byte_perm (w[ 9], w[ 8], selector); + w[58] = __byte_perm (w[ 8], w[ 7], selector); + w[57] = __byte_perm (w[ 7], w[ 6], selector); + w[56] = __byte_perm (w[ 6], w[ 5], selector); + w[55] = __byte_perm (w[ 5], w[ 4], selector); + w[54] = __byte_perm (w[ 4], w[ 3], selector); + w[53] = __byte_perm (w[ 3], w[ 2], selector); + w[52] = __byte_perm (w[ 2], w[ 1], selector); + w[51] = __byte_perm (w[ 1], w[ 0], selector); + w[50] = __byte_perm (w[ 0], 0, selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = __byte_perm (w[12], w[11], selector); + w[62] = __byte_perm (w[11], w[10], selector); + w[61] = __byte_perm (w[10], w[ 9], selector); + w[60] = __byte_perm (w[ 9], w[ 8], selector); + w[59] = __byte_perm (w[ 8], w[ 7], selector); + w[58] = __byte_perm (w[ 7], w[ 6], selector); + w[57] = __byte_perm (w[ 6], w[ 5], selector); + w[56] = __byte_perm (w[ 5], w[ 4], selector); + w[55] = __byte_perm (w[ 4], w[ 3], selector); + w[54] = __byte_perm (w[ 3], w[ 2], selector); + w[53] = __byte_perm (w[ 2], w[ 1], selector); + w[52] = __byte_perm (w[ 1], w[ 0], selector); + w[51] = __byte_perm (w[ 0], 0, selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 
0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = __byte_perm (w[11], w[10], selector); + w[62] = __byte_perm (w[10], w[ 9], selector); + w[61] = __byte_perm (w[ 9], w[ 8], selector); + w[60] = __byte_perm (w[ 8], w[ 7], selector); + w[59] = __byte_perm (w[ 7], w[ 6], selector); + w[58] = __byte_perm (w[ 6], w[ 5], selector); + w[57] = __byte_perm (w[ 5], w[ 4], selector); + w[56] = __byte_perm (w[ 4], w[ 3], selector); + w[55] = __byte_perm (w[ 3], w[ 2], selector); + w[54] = __byte_perm (w[ 2], w[ 1], selector); + w[53] = __byte_perm (w[ 1], w[ 0], selector); + w[52] = __byte_perm (w[ 0], 0, selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = __byte_perm (w[10], w[ 9], selector); + w[62] = __byte_perm (w[ 9], w[ 8], selector); + w[61] = __byte_perm (w[ 8], w[ 7], selector); + w[60] = __byte_perm (w[ 7], w[ 6], selector); + w[59] = __byte_perm (w[ 6], w[ 5], selector); + w[58] = __byte_perm (w[ 5], 
w[ 4], selector); + w[57] = __byte_perm (w[ 4], w[ 3], selector); + w[56] = __byte_perm (w[ 3], w[ 2], selector); + w[55] = __byte_perm (w[ 2], w[ 1], selector); + w[54] = __byte_perm (w[ 1], w[ 0], selector); + w[53] = __byte_perm (w[ 0], 0, selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = __byte_perm (w[ 9], w[ 8], selector); + w[62] = __byte_perm (w[ 8], w[ 7], selector); + w[61] = __byte_perm (w[ 7], w[ 6], selector); + w[60] = __byte_perm (w[ 6], w[ 5], selector); + w[59] = __byte_perm (w[ 5], w[ 4], selector); + w[58] = __byte_perm (w[ 4], w[ 3], selector); + w[57] = __byte_perm (w[ 3], w[ 2], selector); + w[56] = __byte_perm (w[ 2], w[ 1], selector); + w[55] = __byte_perm (w[ 1], w[ 0], selector); + w[54] = __byte_perm (w[ 0], 0, selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 
0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = __byte_perm (w[ 8], w[ 7], selector); + w[62] = __byte_perm (w[ 7], w[ 6], selector); + w[61] = __byte_perm (w[ 6], w[ 5], selector); + w[60] = __byte_perm (w[ 5], w[ 4], selector); + w[59] = __byte_perm (w[ 4], w[ 3], selector); + w[58] = __byte_perm (w[ 3], w[ 2], selector); + w[57] = __byte_perm (w[ 2], w[ 1], selector); + w[56] = __byte_perm (w[ 1], w[ 0], selector); + w[55] = __byte_perm (w[ 0], 0, selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = __byte_perm (w[ 7], w[ 6], selector); + w[62] = __byte_perm (w[ 6], w[ 5], selector); + w[61] = __byte_perm (w[ 5], w[ 4], selector); + w[60] = __byte_perm (w[ 4], w[ 3], selector); + w[59] = __byte_perm (w[ 3], w[ 2], selector); + w[58] = __byte_perm (w[ 2], w[ 1], selector); + w[57] = __byte_perm (w[ 1], w[ 0], selector); + w[56] = __byte_perm (w[ 0], 0, selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 
0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = __byte_perm (w[ 6], w[ 5], selector); + w[62] = __byte_perm (w[ 5], w[ 4], selector); + w[61] = __byte_perm (w[ 4], w[ 3], selector); + w[60] = __byte_perm (w[ 3], w[ 2], selector); + w[59] = __byte_perm (w[ 2], w[ 1], selector); + w[58] = __byte_perm (w[ 1], w[ 0], selector); + w[57] = __byte_perm (w[ 0], 0, selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = __byte_perm (w[ 5], w[ 4], selector); + w[62] = __byte_perm (w[ 4], w[ 3], selector); + w[61] = __byte_perm (w[ 3], w[ 2], selector); + w[60] = __byte_perm (w[ 2], w[ 1], selector); + w[59] = __byte_perm (w[ 1], w[ 0], selector); + w[58] = __byte_perm (w[ 0], 0, selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + 
w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = __byte_perm (w[ 4], w[ 3], selector); + w[62] = __byte_perm (w[ 3], w[ 2], selector); + w[61] = __byte_perm (w[ 2], w[ 1], selector); + w[60] = __byte_perm (w[ 1], w[ 0], selector); + w[59] = __byte_perm (w[ 0], 0, selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = __byte_perm (w[ 3], w[ 2], selector); + w[62] = __byte_perm (w[ 2], w[ 1], selector); + w[61] = __byte_perm (w[ 1], w[ 0], selector); + w[60] = __byte_perm (w[ 0], 0, selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] 
= 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = __byte_perm (w[ 2], w[ 1], selector); + w[62] = __byte_perm (w[ 1], w[ 0], selector); + w[61] = __byte_perm (w[ 0], 0, selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = __byte_perm (w[ 1], w[ 0], selector); + w[62] = __byte_perm (w[ 0], 0, selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 
0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = __byte_perm (w[ 0], 0, selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif +} + +void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) { #if defined cl_amd_media_ops switch (salt_len) @@ -13678,7 +31658,7 @@ inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) #endif } -inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) +void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) { // would be nice to have optimization based on amd_bytealign as with _le counterpart @@ -13775,7 +31755,7 @@ inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) } } 
-inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) { #if defined cl_amd_media_ops switch (salt_len) @@ -14140,7 +32120,7 @@ inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], #endif } -inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) { // would be nice to have optimization based on amd_bytealign as with _le counterpart @@ -14329,7 +32309,7 @@ inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], * vector functions as scalar (for outer loop usage) */ -inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 tmp = 0x01 << ((offset & 3) * 8); @@ -14343,7 +32323,7 @@ inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) w1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) +void append_0x80_1x4_S (u32 w0[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14353,7 +32333,7 @@ inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) w0[3] |= (offset >= 12) ? tmp : 0; } -inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14367,7 +32347,7 @@ inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) w1[3] |= (offset >= 28) ? 
tmp : 0; } -inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14385,7 +32365,7 @@ inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset w2[3] |= (offset >= 44) ? tmp : 0; } -inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14407,7 +32387,7 @@ inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const w3[3] |= (offset >= 60) ? tmp : 0; } -inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { switch (offset) { @@ -14925,7 +32905,7 @@ inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w } } -inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x3727); @@ -14950,7 +32930,7 @@ inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x1707); @@ -14975,7 +32955,7 @@ inline void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x7372); @@ -15000,7 +32980,7 @@ inline void make_utf16le_S (const u32 in[4], u32 
out1[4], u32 out2[4]) #endif } -inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) +void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #ifdef IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x4602); @@ -15021,7 +33001,7 @@ inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) #endif } -inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) +void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #ifdef IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); @@ -15042,251 +33022,163 @@ inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) #endif } -inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); switch (offset / 4) { case 0: - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = 
amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S 
(w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 2: - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], 
offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 3: - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], 
offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 4: - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S 
(w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 5: - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - 
w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 6: - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -15294,32 +33186,18 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 7: - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] 
= amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -15328,30 +33206,17 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 8: - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -15361,28 +33226,16 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - 
w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 9: - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -15393,26 +33246,15 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 10: - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -15424,24 +33266,14 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - 
if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 11: - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -15454,22 +33286,13 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 12: - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -15483,20 +33306,12 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 13: - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] 
= 0; w2[3] = 0; w2[2] = 0; @@ -15511,18 +33326,11 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 14: - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -15538,16 +33346,10 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 15: - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -15564,18 +33366,28 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - break; } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -15903,12 +33715,525 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w #endif } -inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 
c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + + switch (offset / 4) + { + case 0: + c0[0] = amd_bytealign_S (w3[3], 0, offset); + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = amd_bytealign_S (w3[3], 0, offset); + c0[0] = amd_bytealign_S (w3[2], w3[3], offset); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = 
amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = amd_bytealign_S (w3[3], 0, offset); + c0[1] = amd_bytealign_S (w3[2], w3[3], offset); + c0[0] = amd_bytealign_S (w3[1], w3[2], offset); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = amd_bytealign_S (w3[3], 0, offset); + c0[2] = amd_bytealign_S (w3[2], w3[3], offset); + c0[1] = amd_bytealign_S (w3[1], w3[2], offset); + c0[0] = amd_bytealign_S (w3[0], w3[1], offset); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], 
w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = amd_bytealign_S (w3[3], 0, offset); + c0[3] = amd_bytealign_S (w3[2], w3[3], offset); + c0[2] = amd_bytealign_S (w3[1], w3[2], offset); + c0[1] = amd_bytealign_S (w3[0], w3[1], offset); + c0[0] = amd_bytealign_S (w2[3], w3[0], offset); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = amd_bytealign_S (w3[3], 0, offset); + c1[0] = amd_bytealign_S (w3[2], w3[3], offset); + c0[3] = amd_bytealign_S (w3[1], w3[2], offset); + c0[2] = amd_bytealign_S (w3[0], w3[1], offset); + c0[1] = amd_bytealign_S (w2[3], w3[0], offset); + c0[0] = amd_bytealign_S (w2[2], w2[3], offset); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = 
amd_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = amd_bytealign_S (w3[3], 0, offset); + c1[1] = amd_bytealign_S (w3[2], w3[3], offset); + c1[0] = amd_bytealign_S (w3[1], w3[2], offset); + c0[3] = amd_bytealign_S (w3[0], w3[1], offset); + c0[2] = amd_bytealign_S (w2[3], w3[0], offset); + c0[1] = amd_bytealign_S (w2[2], w2[3], offset); + c0[0] = amd_bytealign_S (w2[1], w2[2], offset); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = amd_bytealign_S (w3[3], 0, offset); + c1[2] = amd_bytealign_S (w3[2], w3[3], offset); + c1[1] = amd_bytealign_S (w3[1], w3[2], offset); + c1[0] = amd_bytealign_S (w3[0], w3[1], offset); + c0[3] = amd_bytealign_S (w2[3], w3[0], offset); + c0[2] = amd_bytealign_S (w2[2], w2[3], offset); + c0[1] = amd_bytealign_S (w2[1], w2[2], offset); + c0[0] = amd_bytealign_S (w2[0], w2[1], offset); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 
0; + + break; + + case 8: + c2[0] = amd_bytealign_S (w3[3], 0, offset); + c1[3] = amd_bytealign_S (w3[2], w3[3], offset); + c1[2] = amd_bytealign_S (w3[1], w3[2], offset); + c1[1] = amd_bytealign_S (w3[0], w3[1], offset); + c1[0] = amd_bytealign_S (w2[3], w3[0], offset); + c0[3] = amd_bytealign_S (w2[2], w2[3], offset); + c0[2] = amd_bytealign_S (w2[1], w2[2], offset); + c0[1] = amd_bytealign_S (w2[0], w2[1], offset); + c0[0] = amd_bytealign_S (w1[3], w2[0], offset); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = amd_bytealign_S (w3[3], 0, offset); + c2[0] = amd_bytealign_S (w3[2], w3[3], offset); + c1[3] = amd_bytealign_S (w3[1], w3[2], offset); + c1[2] = amd_bytealign_S (w3[0], w3[1], offset); + c1[1] = amd_bytealign_S (w2[3], w3[0], offset); + c1[0] = amd_bytealign_S (w2[2], w2[3], offset); + c0[3] = amd_bytealign_S (w2[1], w2[2], offset); + c0[2] = amd_bytealign_S (w2[0], w2[1], offset); + c0[1] = amd_bytealign_S (w1[3], w2[0], offset); + c0[0] = amd_bytealign_S (w1[2], w1[3], offset); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = 
amd_bytealign_S (w3[3], 0, offset); + c2[1] = amd_bytealign_S (w3[2], w3[3], offset); + c2[0] = amd_bytealign_S (w3[1], w3[2], offset); + c1[3] = amd_bytealign_S (w3[0], w3[1], offset); + c1[2] = amd_bytealign_S (w2[3], w3[0], offset); + c1[1] = amd_bytealign_S (w2[2], w2[3], offset); + c1[0] = amd_bytealign_S (w2[1], w2[2], offset); + c0[3] = amd_bytealign_S (w2[0], w2[1], offset); + c0[2] = amd_bytealign_S (w1[3], w2[0], offset); + c0[1] = amd_bytealign_S (w1[2], w1[3], offset); + c0[0] = amd_bytealign_S (w1[1], w1[2], offset); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = amd_bytealign_S (w3[3], 0, offset); + c2[2] = amd_bytealign_S (w3[2], w3[3], offset); + c2[1] = amd_bytealign_S (w3[1], w3[2], offset); + c2[0] = amd_bytealign_S (w3[0], w3[1], offset); + c1[3] = amd_bytealign_S (w2[3], w3[0], offset); + c1[2] = amd_bytealign_S (w2[2], w2[3], offset); + c1[1] = amd_bytealign_S (w2[1], w2[2], offset); + c1[0] = amd_bytealign_S (w2[0], w2[1], offset); + c0[3] = amd_bytealign_S (w1[3], w2[0], offset); + c0[2] = amd_bytealign_S (w1[2], w1[3], offset); + c0[1] = amd_bytealign_S (w1[1], w1[2], offset); + c0[0] = amd_bytealign_S (w1[0], w1[1], offset); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + 
c3[0] = amd_bytealign_S (w3[3], 0, offset); + c2[3] = amd_bytealign_S (w3[2], w3[3], offset); + c2[2] = amd_bytealign_S (w3[1], w3[2], offset); + c2[1] = amd_bytealign_S (w3[0], w3[1], offset); + c2[0] = amd_bytealign_S (w2[3], w3[0], offset); + c1[3] = amd_bytealign_S (w2[2], w2[3], offset); + c1[2] = amd_bytealign_S (w2[1], w2[2], offset); + c1[1] = amd_bytealign_S (w2[0], w2[1], offset); + c1[0] = amd_bytealign_S (w1[3], w2[0], offset); + c0[3] = amd_bytealign_S (w1[2], w1[3], offset); + c0[2] = amd_bytealign_S (w1[1], w1[2], offset); + c0[1] = amd_bytealign_S (w1[0], w1[1], offset); + c0[0] = amd_bytealign_S (w0[3], w1[0], offset); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = amd_bytealign_S (w3[3], 0, offset); + c3[0] = amd_bytealign_S (w3[2], w3[3], offset); + c2[3] = amd_bytealign_S (w3[1], w3[2], offset); + c2[2] = amd_bytealign_S (w3[0], w3[1], offset); + c2[1] = amd_bytealign_S (w2[3], w3[0], offset); + c2[0] = amd_bytealign_S (w2[2], w2[3], offset); + c1[3] = amd_bytealign_S (w2[1], w2[2], offset); + c1[2] = amd_bytealign_S (w2[0], w2[1], offset); + c1[1] = amd_bytealign_S (w1[3], w2[0], offset); + c1[0] = amd_bytealign_S (w1[2], w1[3], offset); + c0[3] = amd_bytealign_S (w1[1], w1[2], offset); + c0[2] = amd_bytealign_S (w1[0], w1[1], offset); + c0[1] = amd_bytealign_S (w0[3], w1[0], offset); + c0[0] = amd_bytealign_S (w0[2], w0[3], offset); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + 
w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = amd_bytealign_S (w3[3], 0, offset); + c3[1] = amd_bytealign_S (w3[2], w3[3], offset); + c3[0] = amd_bytealign_S (w3[1], w3[2], offset); + c2[3] = amd_bytealign_S (w3[0], w3[1], offset); + c2[2] = amd_bytealign_S (w2[3], w3[0], offset); + c2[1] = amd_bytealign_S (w2[2], w2[3], offset); + c2[0] = amd_bytealign_S (w2[1], w2[2], offset); + c1[3] = amd_bytealign_S (w2[0], w2[1], offset); + c1[2] = amd_bytealign_S (w1[3], w2[0], offset); + c1[1] = amd_bytealign_S (w1[2], w1[3], offset); + c1[0] = amd_bytealign_S (w1[1], w1[2], offset); + c0[3] = amd_bytealign_S (w1[0], w1[1], offset); + c0[2] = amd_bytealign_S (w0[3], w1[0], offset); + c0[1] = amd_bytealign_S (w0[2], w0[3], offset); + c0[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = amd_bytealign_S (w3[3], 0, offset); + c3[2] = amd_bytealign_S (w3[2], w3[3], offset); + c3[1] = amd_bytealign_S (w3[1], w3[2], offset); + c3[0] = amd_bytealign_S (w3[0], w3[1], offset); + c2[3] = amd_bytealign_S (w2[3], w3[0], offset); + c2[2] = amd_bytealign_S (w2[2], w2[3], offset); + c2[1] = amd_bytealign_S (w2[1], w2[2], offset); + c2[0] = amd_bytealign_S (w2[0], w2[1], offset); + c1[3] = amd_bytealign_S (w1[3], w2[0], offset); + c1[2] = amd_bytealign_S (w1[2], w1[3], offset); + c1[1] = amd_bytealign_S (w1[1], w1[2], offset); + c1[0] = amd_bytealign_S (w1[0], w1[1], offset); + c0[3] = amd_bytealign_S (w0[3], w1[0], offset); + c0[2] = amd_bytealign_S (w0[2], w0[3], offset); + c0[1] = amd_bytealign_S (w0[1], w0[2], offset); + c0[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] 
= 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + c0[0] = swap32_S (c0[0]); + c0[1] = swap32_S (c0[1]); + c0[2] = swap32_S (c0[2]); + c0[3] = swap32_S (c0[3]); + c1[0] = swap32_S (c1[0]); + c1[1] = swap32_S (c1[1]); + c1[2] = swap32_S (c1[2]); + c1[3] = swap32_S (c1[3]); + c2[0] = swap32_S (c2[0]); + c2[1] = swap32_S (c2[1]); + c2[2] = swap32_S (c2[2]); + c2[3] = swap32_S (c2[3]); + c3[0] = swap32_S (c3[0]); + c3[1] = swap32_S (c3[1]); + c3[2] = swap32_S (c3[2]); + c3[3] = swap32_S (c3[3]); + #endif + + #ifdef IS_NV + // todo switch (offset / 4) { case 0: @@ -16703,9 +35028,10 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], break; } + #endif } -inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -17360,7 +35686,7 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w #endif } -inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) 
@@ -18287,459 +36613,291 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], #endif } -inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + w4[0] = swap32_S (w4[0]); + w4[1] = swap32_S (w4[1]); + w4[2] = swap32_S (w4[2]); + w4[3] = swap32_S (w4[3]); + w5[0] = swap32_S (w5[0]); + w5[1] = swap32_S (w5[1]); + w5[2] = swap32_S (w5[2]); + w5[3] = swap32_S (w5[3]); + w6[0] = swap32_S (w6[0]); + w6[1] = swap32_S (w6[1]); + w6[2] = swap32_S (w6[2]); + w6[3] = swap32_S (w6[3]); + w7[0] = swap32_S (w7[0]); + w7[1] = swap32_S (w7[1]); + w7[2] = swap32_S (w7[2]); + w7[3] = swap32_S (w7[3]); switch (offset / 4) { - case 0: - w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign_S (w6[0], 
w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + case 0: + 
w7[3] = amd_bytealign_S (w7[2], w7[3], offset); + w7[2] = amd_bytealign_S (w7[1], w7[2], offset); + w7[1] = amd_bytealign_S (w7[0], w7[1], offset); + w7[0] = amd_bytealign_S (w6[3], w7[0], offset); + w6[3] = amd_bytealign_S (w6[2], w6[3], offset); + w6[2] = amd_bytealign_S (w6[1], w6[2], offset); + w6[1] = amd_bytealign_S (w6[0], w6[1], offset); + w6[0] = amd_bytealign_S (w5[3], w6[0], offset); + w5[3] = amd_bytealign_S (w5[2], w5[3], offset); + w5[2] = amd_bytealign_S (w5[1], w5[2], offset); + w5[1] = amd_bytealign_S (w5[0], w5[1], offset); + w5[0] = amd_bytealign_S (w4[3], w5[0], offset); + w4[3] = amd_bytealign_S (w4[2], w4[3], offset); + w4[2] = amd_bytealign_S (w4[1], w4[2], offset); + w4[1] = amd_bytealign_S (w4[0], w4[1], offset); + w4[0] = amd_bytealign_S (w3[3], w4[0], offset); + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; - case 1: - w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign_S (w6[0], 
w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 1: + w7[3] = amd_bytealign_S (w7[1], w7[2], offset); + w7[2] = amd_bytealign_S (w7[0], w7[1], offset); + w7[1] = amd_bytealign_S (w6[3], w7[0], offset); + w7[0] = amd_bytealign_S (w6[2], w6[3], offset); + w6[3] = amd_bytealign_S (w6[1], w6[2], offset); + w6[2] = amd_bytealign_S (w6[0], w6[1], offset); + w6[1] = amd_bytealign_S (w5[3], w6[0], offset); + w6[0] = amd_bytealign_S (w5[2], w5[3], offset); + w5[3] = amd_bytealign_S (w5[1], w5[2], offset); + w5[2] = amd_bytealign_S (w5[0], w5[1], offset); + w5[1] = amd_bytealign_S (w4[3], w5[0], offset); + w5[0] = amd_bytealign_S 
(w4[2], w4[3], offset); + w4[3] = amd_bytealign_S (w4[1], w4[2], offset); + w4[2] = amd_bytealign_S (w4[0], w4[1], offset); + w4[1] = amd_bytealign_S (w3[3], w4[0], offset); + w4[0] = amd_bytealign_S (w3[2], w3[3], offset); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 2: - w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign_S (w5[2], 
w5[1], offset_minus_4); - w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 2: + w7[3] = amd_bytealign_S (w7[0], w7[1], offset); + w7[2] = amd_bytealign_S (w6[3], w7[0], offset); + w7[1] = amd_bytealign_S (w6[2], w6[3], offset); + w7[0] = amd_bytealign_S (w6[1], w6[2], offset); + w6[3] = amd_bytealign_S (w6[0], w6[1], offset); + w6[2] = amd_bytealign_S (w5[3], w6[0], offset); + w6[1] = amd_bytealign_S (w5[2], w5[3], offset); + w6[0] = amd_bytealign_S (w5[1], w5[2], offset); + w5[3] = amd_bytealign_S (w5[0], w5[1], offset); + w5[2] = amd_bytealign_S (w4[3], w5[0], offset); + w5[1] = amd_bytealign_S (w4[2], w4[3], offset); + w5[0] = amd_bytealign_S (w4[1], w4[2], offset); + w4[3] = amd_bytealign_S (w4[0], w4[1], offset); + w4[2] = amd_bytealign_S (w3[3], w4[0], 
offset); + w4[1] = amd_bytealign_S (w3[2], w3[3], offset); + w4[0] = amd_bytealign_S (w3[1], w3[2], offset); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 3: - w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign_S (w4[2], 
w4[1], offset_minus_4); - w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 3: + w7[3] = amd_bytealign_S (w6[3], w7[0], offset); + w7[2] = amd_bytealign_S (w6[2], w6[3], offset); + w7[1] = amd_bytealign_S (w6[1], w6[2], offset); + w7[0] = amd_bytealign_S (w6[0], w6[1], offset); + w6[3] = amd_bytealign_S (w5[3], w6[0], offset); + w6[2] = amd_bytealign_S (w5[2], w5[3], offset); + w6[1] = amd_bytealign_S (w5[1], w5[2], offset); + w6[0] = amd_bytealign_S (w5[0], w5[1], offset); + w5[3] = amd_bytealign_S (w4[3], w5[0], offset); + w5[2] = amd_bytealign_S (w4[2], w4[3], offset); + w5[1] = amd_bytealign_S (w4[1], w4[2], offset); + w5[0] = amd_bytealign_S (w4[0], w4[1], offset); + w4[3] = amd_bytealign_S (w3[3], w4[0], offset); + w4[2] = amd_bytealign_S (w3[2], w3[3], offset); + w4[1] = amd_bytealign_S (w3[1], w3[2], offset); + w4[0] = amd_bytealign_S (w3[0], w3[1], offset); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S 
(w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 4: - w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign_S (w3[0], 
w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 4: + w7[3] = amd_bytealign_S (w6[2], w6[3], offset); + w7[2] = amd_bytealign_S (w6[1], w6[2], offset); + w7[1] = amd_bytealign_S (w6[0], w6[1], offset); + w7[0] = amd_bytealign_S (w5[3], w6[0], offset); + w6[3] = amd_bytealign_S (w5[2], w5[3], offset); + w6[2] = amd_bytealign_S (w5[1], w5[2], offset); + w6[1] = amd_bytealign_S (w5[0], w5[1], offset); + w6[0] = amd_bytealign_S (w4[3], w5[0], offset); + w5[3] = amd_bytealign_S (w4[2], w4[3], offset); + w5[2] = amd_bytealign_S (w4[1], w4[2], offset); + w5[1] = amd_bytealign_S (w4[0], w4[1], offset); + w5[0] = amd_bytealign_S (w3[3], w4[0], offset); + w4[3] = amd_bytealign_S (w3[2], w3[3], offset); + w4[2] = amd_bytealign_S (w3[1], w3[2], offset); + w4[1] = amd_bytealign_S (w3[0], w3[1], offset); + w4[0] = amd_bytealign_S (w2[3], w3[0], offset); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = 
amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 5: - w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = 
amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 5: + w7[3] = amd_bytealign_S (w6[1], w6[2], offset); + w7[2] = amd_bytealign_S (w6[0], w6[1], offset); + w7[1] = amd_bytealign_S (w5[3], w6[0], offset); + w7[0] = amd_bytealign_S (w5[2], w5[3], offset); + w6[3] = amd_bytealign_S (w5[1], w5[2], offset); + w6[2] = amd_bytealign_S (w5[0], w5[1], offset); + w6[1] = amd_bytealign_S (w4[3], w5[0], offset); + w6[0] = amd_bytealign_S (w4[2], w4[3], offset); + w5[3] = amd_bytealign_S (w4[1], w4[2], offset); + w5[2] = amd_bytealign_S (w4[0], w4[1], offset); + w5[1] = amd_bytealign_S (w3[3], w4[0], offset); + w5[0] = amd_bytealign_S (w3[2], w3[3], offset); + w4[3] = amd_bytealign_S (w3[1], w3[2], offset); + w4[2] = amd_bytealign_S (w3[0], w3[1], offset); + w4[1] = amd_bytealign_S (w2[3], w3[0], offset); + w4[0] = amd_bytealign_S (w2[2], w2[3], offset); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = 
w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 6: - w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 6: + w7[3] = amd_bytealign_S (w6[0], w6[1], offset); + w7[2] = amd_bytealign_S (w5[3], w6[0], offset); + w7[1] = amd_bytealign_S (w5[2], w5[3], offset); + w7[0] = amd_bytealign_S (w5[1], w5[2], offset); + w6[3] = amd_bytealign_S (w5[0], w5[1], offset); + w6[2] = amd_bytealign_S (w4[3], 
w5[0], offset); + w6[1] = amd_bytealign_S (w4[2], w4[3], offset); + w6[0] = amd_bytealign_S (w4[1], w4[2], offset); + w5[3] = amd_bytealign_S (w4[0], w4[1], offset); + w5[2] = amd_bytealign_S (w3[3], w4[0], offset); + w5[1] = amd_bytealign_S (w3[2], w3[3], offset); + w5[0] = amd_bytealign_S (w3[1], w3[2], offset); + w4[3] = amd_bytealign_S (w3[0], w3[1], offset); + w4[2] = amd_bytealign_S (w2[3], w3[0], offset); + w4[1] = amd_bytealign_S (w2[2], w2[3], offset); + w4[0] = amd_bytealign_S (w2[1], w2[2], offset); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -18747,64 +36905,34 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 7: - w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w4[3], w4[2], 
offset_minus_4); - w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 7: + w7[3] = amd_bytealign_S (w5[3], w6[0], offset); + w7[2] = amd_bytealign_S (w5[2], w5[3], offset); + w7[1] = amd_bytealign_S (w5[1], w5[2], offset); + w7[0] = amd_bytealign_S (w5[0], w5[1], offset); + w6[3] = amd_bytealign_S (w4[3], w5[0], offset); + w6[2] = amd_bytealign_S (w4[2], w4[3], offset); + w6[1] = amd_bytealign_S (w4[1], w4[2], offset); + w6[0] = amd_bytealign_S (w4[0], w4[1], offset); + w5[3] = amd_bytealign_S (w3[3], w4[0], offset); + w5[2] = amd_bytealign_S (w3[2], w3[3], offset); + w5[1] = amd_bytealign_S (w3[1], w3[2], offset); + w5[0] = amd_bytealign_S (w3[0], w3[1], offset); + w4[3] = amd_bytealign_S (w2[3], w3[0], offset); + w4[2] = amd_bytealign_S (w2[2], w2[3], offset); + w4[1] = amd_bytealign_S (w2[1], w2[2], offset); + w4[0] = amd_bytealign_S (w2[0], w2[1], offset); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = 
amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -18813,62 +36941,33 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 8: - w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], 
offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 8: + w7[3] = amd_bytealign_S (w5[2], w5[3], offset); + w7[2] = amd_bytealign_S (w5[1], w5[2], offset); + w7[1] = amd_bytealign_S (w5[0], w5[1], offset); + w7[0] = amd_bytealign_S (w4[3], w5[0], offset); + w6[3] = amd_bytealign_S (w4[2], w4[3], offset); + w6[2] = amd_bytealign_S (w4[1], w4[2], offset); + w6[1] = amd_bytealign_S (w4[0], w4[1], offset); + w6[0] = amd_bytealign_S (w3[3], w4[0], offset); + w5[3] = amd_bytealign_S (w3[2], w3[3], offset); + w5[2] = amd_bytealign_S (w3[1], w3[2], offset); + w5[1] = amd_bytealign_S (w3[0], w3[1], offset); + w5[0] = amd_bytealign_S (w2[3], w3[0], offset); + w4[3] = amd_bytealign_S (w2[2], w2[3], offset); + w4[2] = amd_bytealign_S (w2[1], w2[2], offset); + w4[1] = amd_bytealign_S (w2[0], w2[1], offset); + w4[0] = amd_bytealign_S (w1[3], w2[0], offset); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -18878,60 +36977,32 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = 
w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 9: - w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 9: + w7[3] = amd_bytealign_S (w5[1], w5[2], offset); + w7[2] = amd_bytealign_S (w5[0], w5[1], offset); + w7[1] = amd_bytealign_S (w4[3], w5[0], offset); + w7[0] = amd_bytealign_S (w4[2], w4[3], offset); + w6[3] = amd_bytealign_S (w4[1], w4[2], offset); + w6[2] = amd_bytealign_S (w4[0], w4[1], offset); + w6[1] = amd_bytealign_S (w3[3], w4[0], offset); + w6[0] = amd_bytealign_S (w3[2], w3[3], offset); + w5[3] = 
amd_bytealign_S (w3[1], w3[2], offset); + w5[2] = amd_bytealign_S (w3[0], w3[1], offset); + w5[1] = amd_bytealign_S (w2[3], w3[0], offset); + w5[0] = amd_bytealign_S (w2[2], w2[3], offset); + w4[3] = amd_bytealign_S (w2[1], w2[2], offset); + w4[2] = amd_bytealign_S (w2[0], w2[1], offset); + w4[1] = amd_bytealign_S (w1[3], w2[0], offset); + w4[0] = amd_bytealign_S (w1[2], w1[3], offset); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -18942,58 +37013,31 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 10: - w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[0] = 
amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w5[0], w5[1], offset); + w7[2] = amd_bytealign_S (w4[3], w5[0], offset); + w7[1] = amd_bytealign_S (w4[2], w4[3], offset); + w7[0] = amd_bytealign_S (w4[1], w4[2], offset); + w6[3] = amd_bytealign_S (w4[0], w4[1], offset); + w6[2] = amd_bytealign_S (w3[3], w4[0], offset); + w6[1] = amd_bytealign_S (w3[2], w3[3], offset); + w6[0] = amd_bytealign_S (w3[1], w3[2], offset); + w5[3] = amd_bytealign_S (w3[0], w3[1], offset); + w5[2] = amd_bytealign_S (w2[3], w3[0], offset); + w5[1] = amd_bytealign_S (w2[2], w2[3], offset); + w5[0] = amd_bytealign_S (w2[1], w2[2], offset); + w4[3] = amd_bytealign_S (w2[0], w2[1], offset); + w4[2] = amd_bytealign_S (w1[3], w2[0], offset); + w4[1] = amd_bytealign_S (w1[2], w1[3], offset); + w4[0] = amd_bytealign_S (w1[1], w1[2], offset); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -19005,56 +37049,30 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - 
w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 11: - w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[3], w5[0], offset); + w7[2] = amd_bytealign_S (w4[2], w4[3], offset); + w7[1] = amd_bytealign_S (w4[1], w4[2], offset); + w7[0] = amd_bytealign_S (w4[0], w4[1], offset); + w6[3] = amd_bytealign_S (w3[3], w4[0], offset); + w6[2] = amd_bytealign_S (w3[2], w3[3], offset); + w6[1] = amd_bytealign_S (w3[1], w3[2], offset); + w6[0] = amd_bytealign_S (w3[0], w3[1], offset); + w5[3] = amd_bytealign_S (w2[3], w3[0], offset); + w5[2] = amd_bytealign_S 
(w2[2], w2[3], offset); + w5[1] = amd_bytealign_S (w2[1], w2[2], offset); + w5[0] = amd_bytealign_S (w2[0], w2[1], offset); + w4[3] = amd_bytealign_S (w1[3], w2[0], offset); + w4[2] = amd_bytealign_S (w1[2], w1[3], offset); + w4[1] = amd_bytealign_S (w1[1], w1[2], offset); + w4[0] = amd_bytealign_S (w1[0], w1[1], offset); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -19067,54 +37085,29 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 12: - w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign_S (w1[1], w1[0], 
offset_minus_4); - w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[2], w4[3], offset); + w7[2] = amd_bytealign_S (w4[1], w4[2], offset); + w7[1] = amd_bytealign_S (w4[0], w4[1], offset); + w7[0] = amd_bytealign_S (w3[3], w4[0], offset); + w6[3] = amd_bytealign_S (w3[2], w3[3], offset); + w6[2] = amd_bytealign_S (w3[1], w3[2], offset); + w6[1] = amd_bytealign_S (w3[0], w3[1], offset); + w6[0] = amd_bytealign_S (w2[3], w3[0], offset); + w5[3] = amd_bytealign_S (w2[2], w2[3], offset); + w5[2] = amd_bytealign_S (w2[1], w2[2], offset); + w5[1] = amd_bytealign_S (w2[0], w2[1], offset); + w5[0] = amd_bytealign_S (w1[3], w2[0], offset); + w4[3] = amd_bytealign_S (w1[2], w1[3], offset); + w4[2] = amd_bytealign_S (w1[1], w1[2], offset); + w4[1] = amd_bytealign_S (w1[0], w1[1], offset); + w4[0] = amd_bytealign_S (w0[3], w1[0], offset); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -19128,52 +37121,28 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 13: - w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign_S (w4[0], 
w3[3], offset_minus_4); - w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[1], w4[2], offset); + w7[2] = amd_bytealign_S (w4[0], w4[1], offset); + w7[1] = amd_bytealign_S (w3[3], w4[0], offset); + w7[0] = amd_bytealign_S (w3[2], w3[3], offset); + w6[3] = amd_bytealign_S (w3[1], w3[2], offset); + w6[2] = amd_bytealign_S (w3[0], w3[1], offset); + w6[1] = amd_bytealign_S (w2[3], w3[0], offset); + w6[0] = amd_bytealign_S (w2[2], w2[3], offset); + w5[3] = amd_bytealign_S (w2[1], w2[2], offset); + w5[2] = amd_bytealign_S (w2[0], w2[1], offset); + w5[1] = amd_bytealign_S (w1[3], w2[0], offset); + w5[0] = amd_bytealign_S (w1[2], w1[3], offset); + w4[3] = amd_bytealign_S (w1[1], w1[2], offset); + w4[2] = amd_bytealign_S (w1[0], w1[1], offset); + w4[1] = amd_bytealign_S (w0[3], w1[0], offset); + w4[0] = amd_bytealign_S (w0[2], w0[3], offset); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -19188,50 +37157,27 @@ inline void switch_buffer_by_offset_8x4_le_S 
(u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 14: - w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[0], w4[1], offset); + w7[2] = amd_bytealign_S (w3[3], w4[0], offset); + w7[1] = amd_bytealign_S (w3[2], w3[3], offset); + w7[0] = amd_bytealign_S (w3[1], w3[2], offset); + w6[3] = amd_bytealign_S (w3[0], w3[1], offset); + w6[2] = amd_bytealign_S (w2[3], w3[0], offset); + w6[1] = amd_bytealign_S (w2[2], w2[3], offset); + w6[0] = amd_bytealign_S (w2[1], w2[2], offset); + w5[3] = amd_bytealign_S (w2[0], w2[1], offset); + w5[2] = amd_bytealign_S (w1[3], w2[0], offset); + w5[1] = amd_bytealign_S 
(w1[2], w1[3], offset); + w5[0] = amd_bytealign_S (w1[1], w1[2], offset); + w4[3] = amd_bytealign_S (w1[0], w1[1], offset); + w4[2] = amd_bytealign_S (w0[3], w1[0], offset); + w4[1] = amd_bytealign_S (w0[2], w0[3], offset); + w4[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -19247,48 +37193,26 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 15: - w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w3[3], w4[0], offset); + w7[2] = amd_bytealign_S (w3[2], w3[3], offset); + w7[1] = amd_bytealign_S 
(w3[1], w3[2], offset); + w7[0] = amd_bytealign_S (w3[0], w3[1], offset); + w6[3] = amd_bytealign_S (w2[3], w3[0], offset); + w6[2] = amd_bytealign_S (w2[2], w2[3], offset); + w6[1] = amd_bytealign_S (w2[1], w2[2], offset); + w6[0] = amd_bytealign_S (w2[0], w2[1], offset); + w5[3] = amd_bytealign_S (w1[3], w2[0], offset); + w5[2] = amd_bytealign_S (w1[2], w1[3], offset); + w5[1] = amd_bytealign_S (w1[1], w1[2], offset); + w5[0] = amd_bytealign_S (w1[0], w1[1], offset); + w4[3] = amd_bytealign_S (w0[3], w1[0], offset); + w4[2] = amd_bytealign_S (w0[2], w0[3], offset); + w4[1] = amd_bytealign_S (w0[1], w0[2], offset); + w4[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -19305,34 +37229,620 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 16: + w7[3] = amd_bytealign_S (w3[2], w3[3], offset); + w7[2] = amd_bytealign_S (w3[1], w3[2], offset); + w7[1] = amd_bytealign_S (w3[0], w3[1], offset); + w7[0] = amd_bytealign_S (w2[3], w3[0], offset); + w6[3] = amd_bytealign_S (w2[2], w2[3], offset); + w6[2] = amd_bytealign_S (w2[1], w2[2], offset); + w6[1] = amd_bytealign_S (w2[0], w2[1], offset); + w6[0] = amd_bytealign_S (w1[3], w2[0], offset); + w5[3] = amd_bytealign_S (w1[2], w1[3], offset); + w5[2] = amd_bytealign_S (w1[1], w1[2], offset); + w5[1] = amd_bytealign_S (w1[0], w1[1], offset); + w5[0] = amd_bytealign_S (w0[3], w1[0], offset); + w4[3] = amd_bytealign_S (w0[2], w0[3], offset); + w4[2] = amd_bytealign_S (w0[1], w0[2], offset); + w4[1] = amd_bytealign_S (w0[0], w0[1], offset); + w4[0] = amd_bytealign_S 
( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + w7[3] = amd_bytealign_S (w3[1], w3[2], offset); + w7[2] = amd_bytealign_S (w3[0], w3[1], offset); + w7[1] = amd_bytealign_S (w2[3], w3[0], offset); + w7[0] = amd_bytealign_S (w2[2], w2[3], offset); + w6[3] = amd_bytealign_S (w2[1], w2[2], offset); + w6[2] = amd_bytealign_S (w2[0], w2[1], offset); + w6[1] = amd_bytealign_S (w1[3], w2[0], offset); + w6[0] = amd_bytealign_S (w1[2], w1[3], offset); + w5[3] = amd_bytealign_S (w1[1], w1[2], offset); + w5[2] = amd_bytealign_S (w1[0], w1[1], offset); + w5[1] = amd_bytealign_S (w0[3], w1[0], offset); + w5[0] = amd_bytealign_S (w0[2], w0[3], offset); + w4[3] = amd_bytealign_S (w0[1], w0[2], offset); + w4[2] = amd_bytealign_S (w0[0], w0[1], offset); + w4[1] = amd_bytealign_S ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + w7[3] = amd_bytealign_S (w3[0], w3[1], offset); + w7[2] = amd_bytealign_S (w2[3], w3[0], offset); + w7[1] = amd_bytealign_S (w2[2], w2[3], offset); + w7[0] = amd_bytealign_S (w2[1], w2[2], offset); + w6[3] = amd_bytealign_S (w2[0], w2[1], offset); + w6[2] = amd_bytealign_S (w1[3], w2[0], offset); + w6[1] = amd_bytealign_S (w1[2], w1[3], offset); + w6[0] = amd_bytealign_S (w1[1], w1[2], offset); + w5[3] = amd_bytealign_S (w1[0], w1[1], offset); + w5[2] = amd_bytealign_S (w0[3], w1[0], offset); + w5[1] = amd_bytealign_S (w0[2], w0[3], offset); + w5[0] = amd_bytealign_S (w0[1], w0[2], offset); + w4[3] = amd_bytealign_S (w0[0], w0[1], offset); + w4[2] = amd_bytealign_S ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + 
w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + w7[3] = amd_bytealign_S (w2[3], w3[0], offset); + w7[2] = amd_bytealign_S (w2[2], w2[3], offset); + w7[1] = amd_bytealign_S (w2[1], w2[2], offset); + w7[0] = amd_bytealign_S (w2[0], w2[1], offset); + w6[3] = amd_bytealign_S (w1[3], w2[0], offset); + w6[2] = amd_bytealign_S (w1[2], w1[3], offset); + w6[1] = amd_bytealign_S (w1[1], w1[2], offset); + w6[0] = amd_bytealign_S (w1[0], w1[1], offset); + w5[3] = amd_bytealign_S (w0[3], w1[0], offset); + w5[2] = amd_bytealign_S (w0[2], w0[3], offset); + w5[1] = amd_bytealign_S (w0[1], w0[2], offset); + w5[0] = amd_bytealign_S (w0[0], w0[1], offset); + w4[3] = amd_bytealign_S ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + w7[3] = amd_bytealign_S (w2[2], w2[3], offset); + w7[2] = amd_bytealign_S (w2[1], w2[2], offset); + w7[1] = amd_bytealign_S (w2[0], w2[1], offset); + w7[0] = amd_bytealign_S (w1[3], w2[0], offset); + w6[3] = amd_bytealign_S (w1[2], w1[3], offset); + w6[2] = amd_bytealign_S (w1[1], w1[2], offset); + w6[1] = amd_bytealign_S (w1[0], w1[1], offset); + w6[0] = amd_bytealign_S (w0[3], w1[0], offset); + w5[3] = amd_bytealign_S (w0[2], w0[3], offset); + w5[2] = amd_bytealign_S (w0[1], w0[2], offset); + w5[1] = amd_bytealign_S (w0[0], w0[1], offset); + w5[0] = amd_bytealign_S ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 
21: + w7[3] = amd_bytealign_S (w2[1], w2[2], offset); + w7[2] = amd_bytealign_S (w2[0], w2[1], offset); + w7[1] = amd_bytealign_S (w1[3], w2[0], offset); + w7[0] = amd_bytealign_S (w1[2], w1[3], offset); + w6[3] = amd_bytealign_S (w1[1], w1[2], offset); + w6[2] = amd_bytealign_S (w1[0], w1[1], offset); + w6[1] = amd_bytealign_S (w0[3], w1[0], offset); + w6[0] = amd_bytealign_S (w0[2], w0[3], offset); + w5[3] = amd_bytealign_S (w0[1], w0[2], offset); + w5[2] = amd_bytealign_S (w0[0], w0[1], offset); + w5[1] = amd_bytealign_S ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + w7[3] = amd_bytealign_S (w2[0], w2[1], offset); + w7[2] = amd_bytealign_S (w1[3], w2[0], offset); + w7[1] = amd_bytealign_S (w1[2], w1[3], offset); + w7[0] = amd_bytealign_S (w1[1], w1[2], offset); + w6[3] = amd_bytealign_S (w1[0], w1[1], offset); + w6[2] = amd_bytealign_S (w0[3], w1[0], offset); + w6[1] = amd_bytealign_S (w0[2], w0[3], offset); + w6[0] = amd_bytealign_S (w0[1], w0[2], offset); + w5[3] = amd_bytealign_S (w0[0], w0[1], offset); + w5[2] = amd_bytealign_S ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + w7[3] = amd_bytealign_S (w1[3], w2[0], offset); + w7[2] = amd_bytealign_S (w1[2], w1[3], offset); + w7[1] = amd_bytealign_S (w1[1], w1[2], offset); + w7[0] = amd_bytealign_S (w1[0], w1[1], offset); + w6[3] = amd_bytealign_S (w0[3], w1[0], offset); + w6[2] = amd_bytealign_S (w0[2], w0[3], offset); + w6[1] = amd_bytealign_S (w0[1], w0[2], 
offset); + w6[0] = amd_bytealign_S (w0[0], w0[1], offset); + w5[3] = amd_bytealign_S ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + w7[3] = amd_bytealign_S (w1[2], w1[3], offset); + w7[2] = amd_bytealign_S (w1[1], w1[2], offset); + w7[1] = amd_bytealign_S (w1[0], w1[1], offset); + w7[0] = amd_bytealign_S (w0[3], w1[0], offset); + w6[3] = amd_bytealign_S (w0[2], w0[3], offset); + w6[2] = amd_bytealign_S (w0[1], w0[2], offset); + w6[1] = amd_bytealign_S (w0[0], w0[1], offset); + w6[0] = amd_bytealign_S ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + w7[3] = amd_bytealign_S (w1[1], w1[2], offset); + w7[2] = amd_bytealign_S (w1[0], w1[1], offset); + w7[1] = amd_bytealign_S (w0[3], w1[0], offset); + w7[0] = amd_bytealign_S (w0[2], w0[3], offset); + w6[3] = amd_bytealign_S (w0[1], w0[2], offset); + w6[2] = amd_bytealign_S (w0[0], w0[1], offset); + w6[1] = amd_bytealign_S ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + w7[3] = amd_bytealign_S (w1[0], w1[1], offset); + w7[2] = amd_bytealign_S (w0[3], w1[0], offset); + w7[1] = amd_bytealign_S (w0[2], w0[3], 
offset); + w7[0] = amd_bytealign_S (w0[1], w0[2], offset); + w6[3] = amd_bytealign_S (w0[0], w0[1], offset); + w6[2] = amd_bytealign_S ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = amd_bytealign_S (w0[3], w1[0], offset); + w7[2] = amd_bytealign_S (w0[2], w0[3], offset); + w7[1] = amd_bytealign_S (w0[1], w0[2], offset); + w7[0] = amd_bytealign_S (w0[0], w0[1], offset); + w6[3] = amd_bytealign_S ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + w7[3] = amd_bytealign_S (w0[2], w0[3], offset); + w7[2] = amd_bytealign_S (w0[1], w0[2], offset); + w7[1] = amd_bytealign_S (w0[0], w0[1], offset); + w7[0] = amd_bytealign_S ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + w7[3] = amd_bytealign_S (w0[1], w0[2], offset); + w7[2] = amd_bytealign_S (w0[0], w0[1], offset); + w7[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 
0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + w7[3] = amd_bytealign_S (w0[0], w0[1], offset); + w7[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + w7[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + w4[0] = swap32_S (w4[0]); + w4[1] = swap32_S (w4[1]); + w4[2] = swap32_S (w4[2]); + w4[3] = swap32_S (w4[3]); + w5[0] = swap32_S (w5[0]); + w5[1] = swap32_S (w5[1]); + w5[2] = swap32_S (w5[2]); + w5[3] = swap32_S (w5[3]); + w6[0] = swap32_S (w6[0]); + w6[1] = swap32_S (w6[1]); + w6[2] = swap32_S 
(w6[2]); + w6[3] = swap32_S (w6[3]); + w7[0] = swap32_S (w7[0]); + w7[1] = swap32_S (w7[1]); + w7[2] = swap32_S (w7[2]); + w7[3] = swap32_S (w7[3]); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -19900,7 +38410,7 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -22219,7 +40729,7 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) +void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -25594,875 +44104,486 @@ inline void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2 #endif } -inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) +void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); 
switch (offset / 4) { case 0: - w[63] = amd_bytealign_S (w[63], w[62], offset_minus_4); - w[62] = amd_bytealign_S (w[62], w[61], offset_minus_4); - w[61] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[60] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[59] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[58] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[57] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[56] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[55] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[54] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[53] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[52] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[51] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[50] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[49] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[48] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[47] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[46] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[45] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[44] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[43] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[42] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[41] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[40] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[39] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[38] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[37] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[36] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[35] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[34] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[33] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[32] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[31] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[30] = amd_bytealign_S (w[30], w[29], 
offset_minus_4); - w[29] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[28] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[27] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[26] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[25] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[24] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[23] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[22] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[21] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[20] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[19] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[18] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[17] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[16] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[15] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[14] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[13] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[12] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[11] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[10] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 1] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 0] = amd_bytealign_S (w[ 0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w[ 0] = w[ 1]; - w[ 1] = w[ 2]; - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = 
w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } + w[63] = amd_bytealign_S (w[62], w[63], offset); + w[62] = amd_bytealign_S (w[61], w[62], offset); + w[61] = amd_bytealign_S (w[60], w[61], offset); + w[60] = amd_bytealign_S (w[59], w[60], offset); + w[59] = amd_bytealign_S (w[58], w[59], offset); + w[58] = amd_bytealign_S (w[57], w[58], offset); + w[57] = amd_bytealign_S (w[56], w[57], offset); + w[56] = amd_bytealign_S (w[55], w[56], offset); + w[55] = amd_bytealign_S (w[54], w[55], offset); + w[54] = amd_bytealign_S (w[53], w[54], offset); + w[53] = amd_bytealign_S (w[52], w[53], offset); + w[52] = amd_bytealign_S (w[51], w[52], offset); + w[51] = amd_bytealign_S (w[50], w[51], offset); + w[50] = amd_bytealign_S (w[49], w[50], offset); + w[49] = amd_bytealign_S (w[48], w[49], offset); + w[48] = amd_bytealign_S (w[47], w[48], offset); + w[47] = amd_bytealign_S (w[46], w[47], offset); + w[46] = amd_bytealign_S (w[45], w[46], offset); + w[45] = amd_bytealign_S (w[44], w[45], offset); + w[44] = amd_bytealign_S (w[43], w[44], offset); + w[43] = amd_bytealign_S (w[42], w[43], offset); + w[42] = amd_bytealign_S (w[41], w[42], offset); + w[41] = amd_bytealign_S 
(w[40], w[41], offset); + w[40] = amd_bytealign_S (w[39], w[40], offset); + w[39] = amd_bytealign_S (w[38], w[39], offset); + w[38] = amd_bytealign_S (w[37], w[38], offset); + w[37] = amd_bytealign_S (w[36], w[37], offset); + w[36] = amd_bytealign_S (w[35], w[36], offset); + w[35] = amd_bytealign_S (w[34], w[35], offset); + w[34] = amd_bytealign_S (w[33], w[34], offset); + w[33] = amd_bytealign_S (w[32], w[33], offset); + w[32] = amd_bytealign_S (w[31], w[32], offset); + w[31] = amd_bytealign_S (w[30], w[31], offset); + w[30] = amd_bytealign_S (w[29], w[30], offset); + w[29] = amd_bytealign_S (w[28], w[29], offset); + w[28] = amd_bytealign_S (w[27], w[28], offset); + w[27] = amd_bytealign_S (w[26], w[27], offset); + w[26] = amd_bytealign_S (w[25], w[26], offset); + w[25] = amd_bytealign_S (w[24], w[25], offset); + w[24] = amd_bytealign_S (w[23], w[24], offset); + w[23] = amd_bytealign_S (w[22], w[23], offset); + w[22] = amd_bytealign_S (w[21], w[22], offset); + w[21] = amd_bytealign_S (w[20], w[21], offset); + w[20] = amd_bytealign_S (w[19], w[20], offset); + w[19] = amd_bytealign_S (w[18], w[19], offset); + w[18] = amd_bytealign_S (w[17], w[18], offset); + w[17] = amd_bytealign_S (w[16], w[17], offset); + w[16] = amd_bytealign_S (w[15], w[16], offset); + w[15] = amd_bytealign_S (w[14], w[15], offset); + w[14] = amd_bytealign_S (w[13], w[14], offset); + w[13] = amd_bytealign_S (w[12], w[13], offset); + w[12] = amd_bytealign_S (w[11], w[12], offset); + w[11] = amd_bytealign_S (w[10], w[11], offset); + w[10] = amd_bytealign_S (w[ 9], w[10], offset); + w[ 9] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign_S 
(w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); break; case 1: - w[63] = amd_bytealign_S (w[62], w[61], offset_minus_4); - w[62] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[61] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[60] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[59] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[58] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[57] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[56] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[55] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[54] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[53] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[52] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[51] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[50] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[49] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[48] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[47] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[46] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[45] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[44] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[43] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[42] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[41] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[40] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[39] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[38] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[37] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[36] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[35] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[34] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[33] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[32] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[31] = amd_bytealign_S (w[30], w[29], offset_minus_4); 
- w[30] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[29] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[28] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[27] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[26] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[25] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[24] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[23] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[22] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[21] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[20] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[19] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[18] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[17] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[16] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[15] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[14] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[13] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[12] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[11] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[10] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 1] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[61], w[62], offset); + w[62] = amd_bytealign_S (w[60], w[61], offset); + w[61] = amd_bytealign_S (w[59], w[60], offset); + w[60] = amd_bytealign_S (w[58], w[59], offset); + w[59] = amd_bytealign_S (w[57], w[58], offset); + w[58] = 
amd_bytealign_S (w[56], w[57], offset); + w[57] = amd_bytealign_S (w[55], w[56], offset); + w[56] = amd_bytealign_S (w[54], w[55], offset); + w[55] = amd_bytealign_S (w[53], w[54], offset); + w[54] = amd_bytealign_S (w[52], w[53], offset); + w[53] = amd_bytealign_S (w[51], w[52], offset); + w[52] = amd_bytealign_S (w[50], w[51], offset); + w[51] = amd_bytealign_S (w[49], w[50], offset); + w[50] = amd_bytealign_S (w[48], w[49], offset); + w[49] = amd_bytealign_S (w[47], w[48], offset); + w[48] = amd_bytealign_S (w[46], w[47], offset); + w[47] = amd_bytealign_S (w[45], w[46], offset); + w[46] = amd_bytealign_S (w[44], w[45], offset); + w[45] = amd_bytealign_S (w[43], w[44], offset); + w[44] = amd_bytealign_S (w[42], w[43], offset); + w[43] = amd_bytealign_S (w[41], w[42], offset); + w[42] = amd_bytealign_S (w[40], w[41], offset); + w[41] = amd_bytealign_S (w[39], w[40], offset); + w[40] = amd_bytealign_S (w[38], w[39], offset); + w[39] = amd_bytealign_S (w[37], w[38], offset); + w[38] = amd_bytealign_S (w[36], w[37], offset); + w[37] = amd_bytealign_S (w[35], w[36], offset); + w[36] = amd_bytealign_S (w[34], w[35], offset); + w[35] = amd_bytealign_S (w[33], w[34], offset); + w[34] = amd_bytealign_S (w[32], w[33], offset); + w[33] = amd_bytealign_S (w[31], w[32], offset); + w[32] = amd_bytealign_S (w[30], w[31], offset); + w[31] = amd_bytealign_S (w[29], w[30], offset); + w[30] = amd_bytealign_S (w[28], w[29], offset); + w[29] = amd_bytealign_S (w[27], w[28], offset); + w[28] = amd_bytealign_S (w[26], w[27], offset); + w[27] = amd_bytealign_S (w[25], w[26], offset); + w[26] = amd_bytealign_S (w[24], w[25], offset); + w[25] = amd_bytealign_S (w[23], w[24], offset); + w[24] = amd_bytealign_S (w[22], w[23], offset); + w[23] = amd_bytealign_S (w[21], w[22], offset); + w[22] = amd_bytealign_S (w[20], w[21], offset); + w[21] = amd_bytealign_S (w[19], w[20], offset); + w[20] = amd_bytealign_S (w[18], w[19], offset); + w[19] = amd_bytealign_S (w[17], w[18], offset); + w[18] = 
amd_bytealign_S (w[16], w[17], offset); + w[17] = amd_bytealign_S (w[15], w[16], offset); + w[16] = amd_bytealign_S (w[14], w[15], offset); + w[15] = amd_bytealign_S (w[13], w[14], offset); + w[14] = amd_bytealign_S (w[12], w[13], offset); + w[13] = amd_bytealign_S (w[11], w[12], offset); + w[12] = amd_bytealign_S (w[10], w[11], offset); + w[11] = amd_bytealign_S (w[ 9], w[10], offset); + w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 1] = w[ 2]; - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - 
break; case 2: - w[63] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[62] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[61] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[60] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[59] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[58] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[57] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[56] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[55] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[54] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[53] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[52] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[51] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[50] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[49] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[48] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[47] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[46] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[45] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[44] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[43] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[42] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[41] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[40] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[39] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[38] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[37] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[36] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[35] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[34] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[33] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[32] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[31] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[30] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[29] = 
amd_bytealign_S (w[27], w[26], offset_minus_4); - w[28] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[27] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[26] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[25] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[24] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[23] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[22] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[21] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[20] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[19] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[18] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[17] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[16] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[15] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[14] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[13] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[12] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[11] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[10] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[60], w[61], offset); + w[62] = amd_bytealign_S (w[59], w[60], offset); + w[61] = amd_bytealign_S (w[58], w[59], offset); + w[60] = amd_bytealign_S (w[57], w[58], offset); + w[59] = amd_bytealign_S (w[56], w[57], offset); + w[58] = amd_bytealign_S (w[55], w[56], offset); + w[57] = amd_bytealign_S (w[54], w[55], offset); + w[56] = amd_bytealign_S (w[53], 
w[54], offset); + w[55] = amd_bytealign_S (w[52], w[53], offset); + w[54] = amd_bytealign_S (w[51], w[52], offset); + w[53] = amd_bytealign_S (w[50], w[51], offset); + w[52] = amd_bytealign_S (w[49], w[50], offset); + w[51] = amd_bytealign_S (w[48], w[49], offset); + w[50] = amd_bytealign_S (w[47], w[48], offset); + w[49] = amd_bytealign_S (w[46], w[47], offset); + w[48] = amd_bytealign_S (w[45], w[46], offset); + w[47] = amd_bytealign_S (w[44], w[45], offset); + w[46] = amd_bytealign_S (w[43], w[44], offset); + w[45] = amd_bytealign_S (w[42], w[43], offset); + w[44] = amd_bytealign_S (w[41], w[42], offset); + w[43] = amd_bytealign_S (w[40], w[41], offset); + w[42] = amd_bytealign_S (w[39], w[40], offset); + w[41] = amd_bytealign_S (w[38], w[39], offset); + w[40] = amd_bytealign_S (w[37], w[38], offset); + w[39] = amd_bytealign_S (w[36], w[37], offset); + w[38] = amd_bytealign_S (w[35], w[36], offset); + w[37] = amd_bytealign_S (w[34], w[35], offset); + w[36] = amd_bytealign_S (w[33], w[34], offset); + w[35] = amd_bytealign_S (w[32], w[33], offset); + w[34] = amd_bytealign_S (w[31], w[32], offset); + w[33] = amd_bytealign_S (w[30], w[31], offset); + w[32] = amd_bytealign_S (w[29], w[30], offset); + w[31] = amd_bytealign_S (w[28], w[29], offset); + w[30] = amd_bytealign_S (w[27], w[28], offset); + w[29] = amd_bytealign_S (w[26], w[27], offset); + w[28] = amd_bytealign_S (w[25], w[26], offset); + w[27] = amd_bytealign_S (w[24], w[25], offset); + w[26] = amd_bytealign_S (w[23], w[24], offset); + w[25] = amd_bytealign_S (w[22], w[23], offset); + w[24] = amd_bytealign_S (w[21], w[22], offset); + w[23] = amd_bytealign_S (w[20], w[21], offset); + w[22] = amd_bytealign_S (w[19], w[20], offset); + w[21] = amd_bytealign_S (w[18], w[19], offset); + w[20] = amd_bytealign_S (w[17], w[18], offset); + w[19] = amd_bytealign_S (w[16], w[17], offset); + w[18] = amd_bytealign_S (w[15], w[16], offset); + w[17] = amd_bytealign_S (w[14], w[15], offset); + w[16] = amd_bytealign_S (w[13], 
w[14], offset); + w[15] = amd_bytealign_S (w[12], w[13], offset); + w[14] = amd_bytealign_S (w[11], w[12], offset); + w[13] = amd_bytealign_S (w[10], w[11], offset); + w[12] = amd_bytealign_S (w[ 9], w[10], offset); + w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 3: - w[63] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[62] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[61] = amd_bytealign_S (w[58], w[57], 
offset_minus_4); - w[60] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[59] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[58] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[57] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[56] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[55] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[54] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[53] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[52] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[51] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[50] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[49] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[48] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[47] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[46] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[45] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[44] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[43] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[42] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[41] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[40] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[39] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[38] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[37] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[36] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[35] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[34] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[33] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[32] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[31] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[30] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[29] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[28] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[27] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[26] = 
amd_bytealign_S (w[23], w[22], offset_minus_4); - w[25] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[24] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[23] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[22] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[21] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[20] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[19] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[18] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[17] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[16] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[15] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[14] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[13] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[12] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[11] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[10] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[59], w[60], offset); + w[62] = amd_bytealign_S (w[58], w[59], offset); + w[61] = amd_bytealign_S (w[57], w[58], offset); + w[60] = amd_bytealign_S (w[56], w[57], offset); + w[59] = amd_bytealign_S (w[55], w[56], offset); + w[58] = amd_bytealign_S (w[54], w[55], offset); + w[57] = amd_bytealign_S (w[53], w[54], offset); + w[56] = amd_bytealign_S (w[52], w[53], offset); + w[55] = amd_bytealign_S (w[51], w[52], offset); + w[54] = amd_bytealign_S (w[50], w[51], offset); + w[53] = amd_bytealign_S (w[49], w[50], offset); + w[52] = amd_bytealign_S (w[48], w[49], offset); + w[51] = 
amd_bytealign_S (w[47], w[48], offset); + w[50] = amd_bytealign_S (w[46], w[47], offset); + w[49] = amd_bytealign_S (w[45], w[46], offset); + w[48] = amd_bytealign_S (w[44], w[45], offset); + w[47] = amd_bytealign_S (w[43], w[44], offset); + w[46] = amd_bytealign_S (w[42], w[43], offset); + w[45] = amd_bytealign_S (w[41], w[42], offset); + w[44] = amd_bytealign_S (w[40], w[41], offset); + w[43] = amd_bytealign_S (w[39], w[40], offset); + w[42] = amd_bytealign_S (w[38], w[39], offset); + w[41] = amd_bytealign_S (w[37], w[38], offset); + w[40] = amd_bytealign_S (w[36], w[37], offset); + w[39] = amd_bytealign_S (w[35], w[36], offset); + w[38] = amd_bytealign_S (w[34], w[35], offset); + w[37] = amd_bytealign_S (w[33], w[34], offset); + w[36] = amd_bytealign_S (w[32], w[33], offset); + w[35] = amd_bytealign_S (w[31], w[32], offset); + w[34] = amd_bytealign_S (w[30], w[31], offset); + w[33] = amd_bytealign_S (w[29], w[30], offset); + w[32] = amd_bytealign_S (w[28], w[29], offset); + w[31] = amd_bytealign_S (w[27], w[28], offset); + w[30] = amd_bytealign_S (w[26], w[27], offset); + w[29] = amd_bytealign_S (w[25], w[26], offset); + w[28] = amd_bytealign_S (w[24], w[25], offset); + w[27] = amd_bytealign_S (w[23], w[24], offset); + w[26] = amd_bytealign_S (w[22], w[23], offset); + w[25] = amd_bytealign_S (w[21], w[22], offset); + w[24] = amd_bytealign_S (w[20], w[21], offset); + w[23] = amd_bytealign_S (w[19], w[20], offset); + w[22] = amd_bytealign_S (w[18], w[19], offset); + w[21] = amd_bytealign_S (w[17], w[18], offset); + w[20] = amd_bytealign_S (w[16], w[17], offset); + w[19] = amd_bytealign_S (w[15], w[16], offset); + w[18] = amd_bytealign_S (w[14], w[15], offset); + w[17] = amd_bytealign_S (w[13], w[14], offset); + w[16] = amd_bytealign_S (w[12], w[13], offset); + w[15] = amd_bytealign_S (w[11], w[12], offset); + w[14] = amd_bytealign_S (w[10], w[11], offset); + w[13] = amd_bytealign_S (w[ 9], w[10], offset); + w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[11] = 
amd_bytealign_S (w[ 7], w[ 8], offset); + w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 4: - w[63] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[62] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[61] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[60] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[59] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[58] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[57] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[56] = amd_bytealign_S (w[52], w[51], 
offset_minus_4); - w[55] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[54] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[53] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[52] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[51] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[50] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[49] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[48] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[47] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[46] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[45] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[44] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[43] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[42] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[41] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[40] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[39] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[38] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[37] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[36] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[35] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[34] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[33] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[32] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[31] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[30] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[29] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[28] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[27] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[26] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[25] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[24] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[23] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[22] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[21] = 
amd_bytealign_S (w[17], w[16], offset_minus_4); - w[20] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[19] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[18] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[17] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[16] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[15] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[14] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[13] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[12] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[11] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[10] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[58], w[59], offset); + w[62] = amd_bytealign_S (w[57], w[58], offset); + w[61] = amd_bytealign_S (w[56], w[57], offset); + w[60] = amd_bytealign_S (w[55], w[56], offset); + w[59] = amd_bytealign_S (w[54], w[55], offset); + w[58] = amd_bytealign_S (w[53], w[54], offset); + w[57] = amd_bytealign_S (w[52], w[53], offset); + w[56] = amd_bytealign_S (w[51], w[52], offset); + w[55] = amd_bytealign_S (w[50], w[51], offset); + w[54] = amd_bytealign_S (w[49], w[50], offset); + w[53] = amd_bytealign_S (w[48], w[49], offset); + w[52] = amd_bytealign_S (w[47], w[48], offset); + w[51] = amd_bytealign_S (w[46], w[47], offset); + w[50] = amd_bytealign_S (w[45], w[46], offset); + w[49] = amd_bytealign_S (w[44], w[45], offset); + w[48] = amd_bytealign_S (w[43], w[44], offset); + w[47] = amd_bytealign_S (w[42], w[43], offset); + w[46] = amd_bytealign_S (w[41], w[42], offset); + w[45] = amd_bytealign_S (w[40], w[41], offset); + w[44] = 
amd_bytealign_S (w[39], w[40], offset); + w[43] = amd_bytealign_S (w[38], w[39], offset); + w[42] = amd_bytealign_S (w[37], w[38], offset); + w[41] = amd_bytealign_S (w[36], w[37], offset); + w[40] = amd_bytealign_S (w[35], w[36], offset); + w[39] = amd_bytealign_S (w[34], w[35], offset); + w[38] = amd_bytealign_S (w[33], w[34], offset); + w[37] = amd_bytealign_S (w[32], w[33], offset); + w[36] = amd_bytealign_S (w[31], w[32], offset); + w[35] = amd_bytealign_S (w[30], w[31], offset); + w[34] = amd_bytealign_S (w[29], w[30], offset); + w[33] = amd_bytealign_S (w[28], w[29], offset); + w[32] = amd_bytealign_S (w[27], w[28], offset); + w[31] = amd_bytealign_S (w[26], w[27], offset); + w[30] = amd_bytealign_S (w[25], w[26], offset); + w[29] = amd_bytealign_S (w[24], w[25], offset); + w[28] = amd_bytealign_S (w[23], w[24], offset); + w[27] = amd_bytealign_S (w[22], w[23], offset); + w[26] = amd_bytealign_S (w[21], w[22], offset); + w[25] = amd_bytealign_S (w[20], w[21], offset); + w[24] = amd_bytealign_S (w[19], w[20], offset); + w[23] = amd_bytealign_S (w[18], w[19], offset); + w[22] = amd_bytealign_S (w[17], w[18], offset); + w[21] = amd_bytealign_S (w[16], w[17], offset); + w[20] = amd_bytealign_S (w[15], w[16], offset); + w[19] = amd_bytealign_S (w[14], w[15], offset); + w[18] = amd_bytealign_S (w[13], w[14], offset); + w[17] = amd_bytealign_S (w[12], w[13], offset); + w[16] = amd_bytealign_S (w[11], w[12], offset); + w[15] = amd_bytealign_S (w[10], w[11], offset); + w[14] = amd_bytealign_S (w[ 9], w[10], offset); + w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[12] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 4] = 
amd_bytealign_S ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 5: - w[63] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[62] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[61] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[60] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[59] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[58] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[57] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[56] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[55] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[54] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[53] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[52] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[51] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[50] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[49] = amd_bytealign_S (w[44], w[43], 
offset_minus_4); - w[48] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[47] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[46] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[45] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[44] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[43] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[42] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[41] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[40] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[39] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[38] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[37] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[36] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[35] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[34] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[33] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[32] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[31] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[30] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[29] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[28] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[27] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[26] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[25] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[24] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[23] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[22] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[21] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[20] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[19] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[18] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[17] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[16] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[15] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[14] = 
amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[13] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[12] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[11] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[10] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[57], w[58], offset); + w[62] = amd_bytealign_S (w[56], w[57], offset); + w[61] = amd_bytealign_S (w[55], w[56], offset); + w[60] = amd_bytealign_S (w[54], w[55], offset); + w[59] = amd_bytealign_S (w[53], w[54], offset); + w[58] = amd_bytealign_S (w[52], w[53], offset); + w[57] = amd_bytealign_S (w[51], w[52], offset); + w[56] = amd_bytealign_S (w[50], w[51], offset); + w[55] = amd_bytealign_S (w[49], w[50], offset); + w[54] = amd_bytealign_S (w[48], w[49], offset); + w[53] = amd_bytealign_S (w[47], w[48], offset); + w[52] = amd_bytealign_S (w[46], w[47], offset); + w[51] = amd_bytealign_S (w[45], w[46], offset); + w[50] = amd_bytealign_S (w[44], w[45], offset); + w[49] = amd_bytealign_S (w[43], w[44], offset); + w[48] = amd_bytealign_S (w[42], w[43], offset); + w[47] = amd_bytealign_S (w[41], w[42], offset); + w[46] = amd_bytealign_S (w[40], w[41], offset); + w[45] = amd_bytealign_S (w[39], w[40], offset); + w[44] = amd_bytealign_S (w[38], w[39], offset); + w[43] = amd_bytealign_S (w[37], w[38], offset); + w[42] = amd_bytealign_S (w[36], w[37], offset); + w[41] = amd_bytealign_S (w[35], w[36], offset); + w[40] = amd_bytealign_S (w[34], w[35], offset); + w[39] = amd_bytealign_S (w[33], w[34], offset); + w[38] = amd_bytealign_S (w[32], w[33], offset); + w[37] = amd_bytealign_S (w[31], w[32], offset); + w[36] = amd_bytealign_S (w[30], w[31], offset); + w[35] = amd_bytealign_S (w[29], 
w[30], offset); + w[34] = amd_bytealign_S (w[28], w[29], offset); + w[33] = amd_bytealign_S (w[27], w[28], offset); + w[32] = amd_bytealign_S (w[26], w[27], offset); + w[31] = amd_bytealign_S (w[25], w[26], offset); + w[30] = amd_bytealign_S (w[24], w[25], offset); + w[29] = amd_bytealign_S (w[23], w[24], offset); + w[28] = amd_bytealign_S (w[22], w[23], offset); + w[27] = amd_bytealign_S (w[21], w[22], offset); + w[26] = amd_bytealign_S (w[20], w[21], offset); + w[25] = amd_bytealign_S (w[19], w[20], offset); + w[24] = amd_bytealign_S (w[18], w[19], offset); + w[23] = amd_bytealign_S (w[17], w[18], offset); + w[22] = amd_bytealign_S (w[16], w[17], offset); + w[21] = amd_bytealign_S (w[15], w[16], offset); + w[20] = amd_bytealign_S (w[14], w[15], offset); + w[19] = amd_bytealign_S (w[13], w[14], offset); + w[18] = amd_bytealign_S (w[12], w[13], offset); + w[17] = amd_bytealign_S (w[11], w[12], offset); + w[16] = amd_bytealign_S (w[10], w[11], offset); + w[15] = amd_bytealign_S (w[ 9], w[10], offset); + w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign_S ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = 
w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 6: - w[63] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[62] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[61] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[60] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[59] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[58] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[57] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[56] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[55] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[54] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[53] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[52] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[51] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[50] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[49] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[48] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[47] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[46] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[45] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[44] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[43] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[42] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[41] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[40] = amd_bytealign_S (w[34], w[33], 
offset_minus_4); - w[39] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[38] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[37] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[36] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[35] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[34] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[33] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[32] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[31] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[30] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[29] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[28] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[27] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[26] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[25] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[24] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[23] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[22] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[21] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[20] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[19] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[18] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[17] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[16] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[15] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[14] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[13] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[12] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[11] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[10] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = 
amd_bytealign_S (w[56], w[57], offset); + w[62] = amd_bytealign_S (w[55], w[56], offset); + w[61] = amd_bytealign_S (w[54], w[55], offset); + w[60] = amd_bytealign_S (w[53], w[54], offset); + w[59] = amd_bytealign_S (w[52], w[53], offset); + w[58] = amd_bytealign_S (w[51], w[52], offset); + w[57] = amd_bytealign_S (w[50], w[51], offset); + w[56] = amd_bytealign_S (w[49], w[50], offset); + w[55] = amd_bytealign_S (w[48], w[49], offset); + w[54] = amd_bytealign_S (w[47], w[48], offset); + w[53] = amd_bytealign_S (w[46], w[47], offset); + w[52] = amd_bytealign_S (w[45], w[46], offset); + w[51] = amd_bytealign_S (w[44], w[45], offset); + w[50] = amd_bytealign_S (w[43], w[44], offset); + w[49] = amd_bytealign_S (w[42], w[43], offset); + w[48] = amd_bytealign_S (w[41], w[42], offset); + w[47] = amd_bytealign_S (w[40], w[41], offset); + w[46] = amd_bytealign_S (w[39], w[40], offset); + w[45] = amd_bytealign_S (w[38], w[39], offset); + w[44] = amd_bytealign_S (w[37], w[38], offset); + w[43] = amd_bytealign_S (w[36], w[37], offset); + w[42] = amd_bytealign_S (w[35], w[36], offset); + w[41] = amd_bytealign_S (w[34], w[35], offset); + w[40] = amd_bytealign_S (w[33], w[34], offset); + w[39] = amd_bytealign_S (w[32], w[33], offset); + w[38] = amd_bytealign_S (w[31], w[32], offset); + w[37] = amd_bytealign_S (w[30], w[31], offset); + w[36] = amd_bytealign_S (w[29], w[30], offset); + w[35] = amd_bytealign_S (w[28], w[29], offset); + w[34] = amd_bytealign_S (w[27], w[28], offset); + w[33] = amd_bytealign_S (w[26], w[27], offset); + w[32] = amd_bytealign_S (w[25], w[26], offset); + w[31] = amd_bytealign_S (w[24], w[25], offset); + w[30] = amd_bytealign_S (w[23], w[24], offset); + w[29] = amd_bytealign_S (w[22], w[23], offset); + w[28] = amd_bytealign_S (w[21], w[22], offset); + w[27] = amd_bytealign_S (w[20], w[21], offset); + w[26] = amd_bytealign_S (w[19], w[20], offset); + w[25] = amd_bytealign_S (w[18], w[19], offset); + w[24] = amd_bytealign_S (w[17], w[18], offset); + w[23] = 
amd_bytealign_S (w[16], w[17], offset); + w[22] = amd_bytealign_S (w[15], w[16], offset); + w[21] = amd_bytealign_S (w[14], w[15], offset); + w[20] = amd_bytealign_S (w[13], w[14], offset); + w[19] = amd_bytealign_S (w[12], w[13], offset); + w[18] = amd_bytealign_S (w[11], w[12], offset); + w[17] = amd_bytealign_S (w[10], w[11], offset); + w[16] = amd_bytealign_S (w[ 9], w[10], offset); + w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -26470,128 +44591,66 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] 
= w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 7: - w[63] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[62] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[61] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[60] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[59] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[58] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[57] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[56] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[55] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[54] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[53] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[52] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[51] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[50] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[49] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[48] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[47] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[46] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[45] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[44] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[43] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[42] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[41] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[40] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[39] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[38] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[37] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[36] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[35] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[34] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[33] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[32] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[31] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[30] 
= amd_bytealign_S (w[23], w[22], offset_minus_4); - w[29] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[28] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[27] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[26] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[25] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[24] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[23] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[22] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[21] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[20] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[19] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[18] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[17] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[16] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[15] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[14] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[13] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[12] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[11] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[10] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[55], w[56], offset); + w[62] = amd_bytealign_S (w[54], w[55], offset); + w[61] = amd_bytealign_S (w[53], w[54], offset); + w[60] = amd_bytealign_S (w[52], w[53], offset); + w[59] = amd_bytealign_S (w[51], w[52], offset); + w[58] = amd_bytealign_S (w[50], w[51], offset); + w[57] = amd_bytealign_S (w[49], w[50], offset); + w[56] = amd_bytealign_S (w[48], w[49], offset); + w[55] = amd_bytealign_S (w[47], w[48], offset); + w[54] = amd_bytealign_S (w[46], w[47], offset); + w[53] = amd_bytealign_S (w[45], w[46], offset); + w[52] = amd_bytealign_S (w[44], w[45], offset); + w[51] = 
amd_bytealign_S (w[43], w[44], offset); + w[50] = amd_bytealign_S (w[42], w[43], offset); + w[49] = amd_bytealign_S (w[41], w[42], offset); + w[48] = amd_bytealign_S (w[40], w[41], offset); + w[47] = amd_bytealign_S (w[39], w[40], offset); + w[46] = amd_bytealign_S (w[38], w[39], offset); + w[45] = amd_bytealign_S (w[37], w[38], offset); + w[44] = amd_bytealign_S (w[36], w[37], offset); + w[43] = amd_bytealign_S (w[35], w[36], offset); + w[42] = amd_bytealign_S (w[34], w[35], offset); + w[41] = amd_bytealign_S (w[33], w[34], offset); + w[40] = amd_bytealign_S (w[32], w[33], offset); + w[39] = amd_bytealign_S (w[31], w[32], offset); + w[38] = amd_bytealign_S (w[30], w[31], offset); + w[37] = amd_bytealign_S (w[29], w[30], offset); + w[36] = amd_bytealign_S (w[28], w[29], offset); + w[35] = amd_bytealign_S (w[27], w[28], offset); + w[34] = amd_bytealign_S (w[26], w[27], offset); + w[33] = amd_bytealign_S (w[25], w[26], offset); + w[32] = amd_bytealign_S (w[24], w[25], offset); + w[31] = amd_bytealign_S (w[23], w[24], offset); + w[30] = amd_bytealign_S (w[22], w[23], offset); + w[29] = amd_bytealign_S (w[21], w[22], offset); + w[28] = amd_bytealign_S (w[20], w[21], offset); + w[27] = amd_bytealign_S (w[19], w[20], offset); + w[26] = amd_bytealign_S (w[18], w[19], offset); + w[25] = amd_bytealign_S (w[17], w[18], offset); + w[24] = amd_bytealign_S (w[16], w[17], offset); + w[23] = amd_bytealign_S (w[15], w[16], offset); + w[22] = amd_bytealign_S (w[14], w[15], offset); + w[21] = amd_bytealign_S (w[13], w[14], offset); + w[20] = amd_bytealign_S (w[12], w[13], offset); + w[19] = amd_bytealign_S (w[11], w[12], offset); + w[18] = amd_bytealign_S (w[10], w[11], offset); + w[17] = amd_bytealign_S (w[ 9], w[10], offset); + w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[11] = 
amd_bytealign_S (w[ 3], w[ 4], offset); + w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -26600,126 +44659,65 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 8: - w[63] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[62] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[61] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[60] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[59] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[58] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[57] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[56] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[55] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[54] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[53] = 
amd_bytealign_S (w[45], w[44], offset_minus_4); - w[52] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[51] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[50] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[49] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[48] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[47] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[46] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[45] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[44] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[43] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[42] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[41] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[40] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[39] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[38] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[37] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[36] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[35] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[34] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[33] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[32] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[31] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[30] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[29] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[28] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[27] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[26] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[25] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[24] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[23] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[22] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[21] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[20] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[19] = amd_bytealign_S (w[11], 
w[10], offset_minus_4); - w[18] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[17] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[16] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[15] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[14] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[13] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[12] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[11] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[10] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[54], w[55], offset); + w[62] = amd_bytealign_S (w[53], w[54], offset); + w[61] = amd_bytealign_S (w[52], w[53], offset); + w[60] = amd_bytealign_S (w[51], w[52], offset); + w[59] = amd_bytealign_S (w[50], w[51], offset); + w[58] = amd_bytealign_S (w[49], w[50], offset); + w[57] = amd_bytealign_S (w[48], w[49], offset); + w[56] = amd_bytealign_S (w[47], w[48], offset); + w[55] = amd_bytealign_S (w[46], w[47], offset); + w[54] = amd_bytealign_S (w[45], w[46], offset); + w[53] = amd_bytealign_S (w[44], w[45], offset); + w[52] = amd_bytealign_S (w[43], w[44], offset); + w[51] = amd_bytealign_S (w[42], w[43], offset); + w[50] = amd_bytealign_S (w[41], w[42], offset); + w[49] = amd_bytealign_S (w[40], w[41], offset); + w[48] = amd_bytealign_S (w[39], w[40], offset); + w[47] = amd_bytealign_S (w[38], w[39], offset); + w[46] = amd_bytealign_S (w[37], w[38], offset); + w[45] = amd_bytealign_S (w[36], w[37], offset); + w[44] = amd_bytealign_S (w[35], w[36], offset); + w[43] = amd_bytealign_S (w[34], w[35], offset); + w[42] = amd_bytealign_S (w[33], w[34], offset); + w[41] = amd_bytealign_S (w[32], w[33], offset); + w[40] = amd_bytealign_S (w[31], w[32], offset); + w[39] = amd_bytealign_S (w[30], w[31], offset); + w[38] = amd_bytealign_S (w[29], w[30], offset); + w[37] = amd_bytealign_S (w[28], w[29], 
offset); + w[36] = amd_bytealign_S (w[27], w[28], offset); + w[35] = amd_bytealign_S (w[26], w[27], offset); + w[34] = amd_bytealign_S (w[25], w[26], offset); + w[33] = amd_bytealign_S (w[24], w[25], offset); + w[32] = amd_bytealign_S (w[23], w[24], offset); + w[31] = amd_bytealign_S (w[22], w[23], offset); + w[30] = amd_bytealign_S (w[21], w[22], offset); + w[29] = amd_bytealign_S (w[20], w[21], offset); + w[28] = amd_bytealign_S (w[19], w[20], offset); + w[27] = amd_bytealign_S (w[18], w[19], offset); + w[26] = amd_bytealign_S (w[17], w[18], offset); + w[25] = amd_bytealign_S (w[16], w[17], offset); + w[24] = amd_bytealign_S (w[15], w[16], offset); + w[23] = amd_bytealign_S (w[14], w[15], offset); + w[22] = amd_bytealign_S (w[13], w[14], offset); + w[21] = amd_bytealign_S (w[12], w[13], offset); + w[20] = amd_bytealign_S (w[11], w[12], offset); + w[19] = amd_bytealign_S (w[10], w[11], offset); + w[18] = amd_bytealign_S (w[ 9], w[10], offset); + w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -26729,124 +44727,64 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; 
- w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 9: - w[63] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[62] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[61] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[60] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[59] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[58] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[57] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[56] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[55] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[54] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[53] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[52] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[51] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[50] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[49] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[48] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[47] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[46] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[45] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[44] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[43] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[42] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[41] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[40] = amd_bytealign_S (w[31], w[30], 
offset_minus_4); - w[39] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[38] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[37] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[36] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[35] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[34] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[33] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[32] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[31] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[30] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[29] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[28] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[27] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[26] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[25] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[24] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[23] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[22] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[21] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[20] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[19] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[18] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[17] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[16] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[15] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[14] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[13] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[12] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[11] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[10] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[53], w[54], offset); + w[62] = amd_bytealign_S (w[52], w[53], offset); + w[61] = amd_bytealign_S (w[51], w[52], offset); + w[60] = amd_bytealign_S (w[50], 
w[51], offset); + w[59] = amd_bytealign_S (w[49], w[50], offset); + w[58] = amd_bytealign_S (w[48], w[49], offset); + w[57] = amd_bytealign_S (w[47], w[48], offset); + w[56] = amd_bytealign_S (w[46], w[47], offset); + w[55] = amd_bytealign_S (w[45], w[46], offset); + w[54] = amd_bytealign_S (w[44], w[45], offset); + w[53] = amd_bytealign_S (w[43], w[44], offset); + w[52] = amd_bytealign_S (w[42], w[43], offset); + w[51] = amd_bytealign_S (w[41], w[42], offset); + w[50] = amd_bytealign_S (w[40], w[41], offset); + w[49] = amd_bytealign_S (w[39], w[40], offset); + w[48] = amd_bytealign_S (w[38], w[39], offset); + w[47] = amd_bytealign_S (w[37], w[38], offset); + w[46] = amd_bytealign_S (w[36], w[37], offset); + w[45] = amd_bytealign_S (w[35], w[36], offset); + w[44] = amd_bytealign_S (w[34], w[35], offset); + w[43] = amd_bytealign_S (w[33], w[34], offset); + w[42] = amd_bytealign_S (w[32], w[33], offset); + w[41] = amd_bytealign_S (w[31], w[32], offset); + w[40] = amd_bytealign_S (w[30], w[31], offset); + w[39] = amd_bytealign_S (w[29], w[30], offset); + w[38] = amd_bytealign_S (w[28], w[29], offset); + w[37] = amd_bytealign_S (w[27], w[28], offset); + w[36] = amd_bytealign_S (w[26], w[27], offset); + w[35] = amd_bytealign_S (w[25], w[26], offset); + w[34] = amd_bytealign_S (w[24], w[25], offset); + w[33] = amd_bytealign_S (w[23], w[24], offset); + w[32] = amd_bytealign_S (w[22], w[23], offset); + w[31] = amd_bytealign_S (w[21], w[22], offset); + w[30] = amd_bytealign_S (w[20], w[21], offset); + w[29] = amd_bytealign_S (w[19], w[20], offset); + w[28] = amd_bytealign_S (w[18], w[19], offset); + w[27] = amd_bytealign_S (w[17], w[18], offset); + w[26] = amd_bytealign_S (w[16], w[17], offset); + w[25] = amd_bytealign_S (w[15], w[16], offset); + w[24] = amd_bytealign_S (w[14], w[15], offset); + w[23] = amd_bytealign_S (w[13], w[14], offset); + w[22] = amd_bytealign_S (w[12], w[13], offset); + w[21] = amd_bytealign_S (w[11], w[12], offset); + w[20] = amd_bytealign_S (w[10], 
w[11], offset); + w[19] = amd_bytealign_S (w[ 9], w[10], offset); + w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -26857,122 +44795,63 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 10: - w[63] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[62] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[61] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[60] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[59] = amd_bytealign_S (w[49], w[48], offset_minus_4); - 
w[58] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[57] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[56] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[55] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[54] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[53] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[52] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[51] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[50] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[49] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[48] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[47] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[46] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[45] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[44] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[43] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[42] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[41] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[40] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[39] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[38] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[37] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[36] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[35] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[34] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[33] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[32] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[31] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[30] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[29] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[28] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[27] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[26] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[25] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[24] = amd_bytealign_S 
(w[14], w[13], offset_minus_4); - w[23] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[22] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[21] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[20] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[19] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[18] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[17] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[16] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[15] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[14] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[13] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[12] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[11] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[10] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[52], w[53], offset); + w[62] = amd_bytealign_S (w[51], w[52], offset); + w[61] = amd_bytealign_S (w[50], w[51], offset); + w[60] = amd_bytealign_S (w[49], w[50], offset); + w[59] = amd_bytealign_S (w[48], w[49], offset); + w[58] = amd_bytealign_S (w[47], w[48], offset); + w[57] = amd_bytealign_S (w[46], w[47], offset); + w[56] = amd_bytealign_S (w[45], w[46], offset); + w[55] = amd_bytealign_S (w[44], w[45], offset); + w[54] = amd_bytealign_S (w[43], w[44], offset); + w[53] = amd_bytealign_S (w[42], w[43], offset); + w[52] = amd_bytealign_S (w[41], w[42], offset); + w[51] = amd_bytealign_S (w[40], w[41], offset); + w[50] = amd_bytealign_S (w[39], w[40], offset); + w[49] = amd_bytealign_S (w[38], w[39], offset); + w[48] = amd_bytealign_S (w[37], w[38], offset); + w[47] = amd_bytealign_S (w[36], w[37], offset); + w[46] = amd_bytealign_S (w[35], w[36], offset); + w[45] = amd_bytealign_S (w[34], w[35], offset); + w[44] = amd_bytealign_S (w[33], w[34], offset); + w[43] = amd_bytealign_S (w[32], w[33], offset); + w[42] = amd_bytealign_S (w[31], w[32], offset); + w[41] = amd_bytealign_S (w[30], w[31], offset); + w[40] = 
amd_bytealign_S (w[29], w[30], offset); + w[39] = amd_bytealign_S (w[28], w[29], offset); + w[38] = amd_bytealign_S (w[27], w[28], offset); + w[37] = amd_bytealign_S (w[26], w[27], offset); + w[36] = amd_bytealign_S (w[25], w[26], offset); + w[35] = amd_bytealign_S (w[24], w[25], offset); + w[34] = amd_bytealign_S (w[23], w[24], offset); + w[33] = amd_bytealign_S (w[22], w[23], offset); + w[32] = amd_bytealign_S (w[21], w[22], offset); + w[31] = amd_bytealign_S (w[20], w[21], offset); + w[30] = amd_bytealign_S (w[19], w[20], offset); + w[29] = amd_bytealign_S (w[18], w[19], offset); + w[28] = amd_bytealign_S (w[17], w[18], offset); + w[27] = amd_bytealign_S (w[16], w[17], offset); + w[26] = amd_bytealign_S (w[15], w[16], offset); + w[25] = amd_bytealign_S (w[14], w[15], offset); + w[24] = amd_bytealign_S (w[13], w[14], offset); + w[23] = amd_bytealign_S (w[12], w[13], offset); + w[22] = amd_bytealign_S (w[11], w[12], offset); + w[21] = amd_bytealign_S (w[10], w[11], offset); + w[20] = amd_bytealign_S (w[ 9], w[10], offset); + w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[17] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[10] = amd_bytealign_S ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -26984,120 +44863,62 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - 
w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 11: - w[63] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[62] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[61] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[60] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[59] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[58] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[57] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[56] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[55] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[54] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[53] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[52] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[51] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[50] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[49] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[48] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[47] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[46] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[45] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[44] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[43] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[42] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[41] = amd_bytealign_S (w[30], w[29], offset_minus_4); - 
w[40] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[39] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[38] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[37] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[36] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[35] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[34] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[33] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[32] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[31] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[30] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[29] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[28] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[27] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[26] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[25] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[24] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[23] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[22] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[21] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[20] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[19] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[18] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[17] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[16] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[15] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[14] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[13] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[12] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[11] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[51], w[52], offset); + w[62] = amd_bytealign_S (w[50], w[51], offset); + w[61] = amd_bytealign_S (w[49], w[50], offset); + w[60] = amd_bytealign_S (w[48], w[49], offset); + w[59] = amd_bytealign_S (w[47], w[48], offset); + w[58] = 
amd_bytealign_S (w[46], w[47], offset); + w[57] = amd_bytealign_S (w[45], w[46], offset); + w[56] = amd_bytealign_S (w[44], w[45], offset); + w[55] = amd_bytealign_S (w[43], w[44], offset); + w[54] = amd_bytealign_S (w[42], w[43], offset); + w[53] = amd_bytealign_S (w[41], w[42], offset); + w[52] = amd_bytealign_S (w[40], w[41], offset); + w[51] = amd_bytealign_S (w[39], w[40], offset); + w[50] = amd_bytealign_S (w[38], w[39], offset); + w[49] = amd_bytealign_S (w[37], w[38], offset); + w[48] = amd_bytealign_S (w[36], w[37], offset); + w[47] = amd_bytealign_S (w[35], w[36], offset); + w[46] = amd_bytealign_S (w[34], w[35], offset); + w[45] = amd_bytealign_S (w[33], w[34], offset); + w[44] = amd_bytealign_S (w[32], w[33], offset); + w[43] = amd_bytealign_S (w[31], w[32], offset); + w[42] = amd_bytealign_S (w[30], w[31], offset); + w[41] = amd_bytealign_S (w[29], w[30], offset); + w[40] = amd_bytealign_S (w[28], w[29], offset); + w[39] = amd_bytealign_S (w[27], w[28], offset); + w[38] = amd_bytealign_S (w[26], w[27], offset); + w[37] = amd_bytealign_S (w[25], w[26], offset); + w[36] = amd_bytealign_S (w[24], w[25], offset); + w[35] = amd_bytealign_S (w[23], w[24], offset); + w[34] = amd_bytealign_S (w[22], w[23], offset); + w[33] = amd_bytealign_S (w[21], w[22], offset); + w[32] = amd_bytealign_S (w[20], w[21], offset); + w[31] = amd_bytealign_S (w[19], w[20], offset); + w[30] = amd_bytealign_S (w[18], w[19], offset); + w[29] = amd_bytealign_S (w[17], w[18], offset); + w[28] = amd_bytealign_S (w[16], w[17], offset); + w[27] = amd_bytealign_S (w[15], w[16], offset); + w[26] = amd_bytealign_S (w[14], w[15], offset); + w[25] = amd_bytealign_S (w[13], w[14], offset); + w[24] = amd_bytealign_S (w[12], w[13], offset); + w[23] = amd_bytealign_S (w[11], w[12], offset); + w[22] = amd_bytealign_S (w[10], w[11], offset); + w[21] = amd_bytealign_S (w[ 9], w[10], offset); + w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[18] = 
amd_bytealign_S (w[ 6], w[ 7], offset); + w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[11] = amd_bytealign_S ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -27110,118 +44931,61 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 12: - w[63] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[62] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[61] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[60] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[59] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[58] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[57] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[56] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[55] = amd_bytealign_S (w[43], w[42], 
offset_minus_4); - w[54] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[53] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[52] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[51] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[50] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[49] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[48] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[47] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[46] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[45] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[44] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[43] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[42] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[41] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[40] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[39] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[38] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[37] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[36] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[35] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[34] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[33] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[32] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[31] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[30] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[29] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[28] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[27] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[26] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[25] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[24] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[23] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[22] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[21] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[20] = 
amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[19] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[18] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[17] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[16] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[15] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[14] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[13] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[12] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[50], w[51], offset); + w[62] = amd_bytealign_S (w[49], w[50], offset); + w[61] = amd_bytealign_S (w[48], w[49], offset); + w[60] = amd_bytealign_S (w[47], w[48], offset); + w[59] = amd_bytealign_S (w[46], w[47], offset); + w[58] = amd_bytealign_S (w[45], w[46], offset); + w[57] = amd_bytealign_S (w[44], w[45], offset); + w[56] = amd_bytealign_S (w[43], w[44], offset); + w[55] = amd_bytealign_S (w[42], w[43], offset); + w[54] = amd_bytealign_S (w[41], w[42], offset); + w[53] = amd_bytealign_S (w[40], w[41], offset); + w[52] = amd_bytealign_S (w[39], w[40], offset); + w[51] = amd_bytealign_S (w[38], w[39], offset); + w[50] = amd_bytealign_S (w[37], w[38], offset); + w[49] = amd_bytealign_S (w[36], w[37], offset); + w[48] = amd_bytealign_S (w[35], w[36], offset); + w[47] = amd_bytealign_S (w[34], w[35], offset); + w[46] = amd_bytealign_S (w[33], w[34], offset); + w[45] = amd_bytealign_S (w[32], w[33], offset); + w[44] = amd_bytealign_S (w[31], w[32], offset); + w[43] = amd_bytealign_S (w[30], w[31], offset); + w[42] = amd_bytealign_S (w[29], w[30], offset); + w[41] = amd_bytealign_S (w[28], w[29], offset); + w[40] = amd_bytealign_S (w[27], w[28], offset); + w[39] = amd_bytealign_S (w[26], w[27], offset); + w[38] = amd_bytealign_S (w[25], w[26], offset); + w[37] = amd_bytealign_S (w[24], w[25], offset); + w[36] = amd_bytealign_S (w[23], w[24], offset); + w[35] = amd_bytealign_S (w[22], w[23], offset); + w[34] = amd_bytealign_S (w[21], w[22], 
offset); + w[33] = amd_bytealign_S (w[20], w[21], offset); + w[32] = amd_bytealign_S (w[19], w[20], offset); + w[31] = amd_bytealign_S (w[18], w[19], offset); + w[30] = amd_bytealign_S (w[17], w[18], offset); + w[29] = amd_bytealign_S (w[16], w[17], offset); + w[28] = amd_bytealign_S (w[15], w[16], offset); + w[27] = amd_bytealign_S (w[14], w[15], offset); + w[26] = amd_bytealign_S (w[13], w[14], offset); + w[25] = amd_bytealign_S (w[12], w[13], offset); + w[24] = amd_bytealign_S (w[11], w[12], offset); + w[23] = amd_bytealign_S (w[10], w[11], offset); + w[22] = amd_bytealign_S (w[ 9], w[10], offset); + w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[18] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[12] = amd_bytealign_S ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -27235,116 +44999,60 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = 
w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 13: - w[63] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[62] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[61] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[60] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[59] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[58] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[57] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[56] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[55] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[54] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[53] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[52] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[51] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[50] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[49] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[48] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[47] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[46] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[45] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[44] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[43] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[42] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[41] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[40] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[39] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[38] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[37] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[36] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[35] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[34] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[33] = amd_bytealign_S (w[20], w[19], offset_minus_4); - 
w[32] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[31] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[30] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[29] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[28] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[27] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[26] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[25] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[24] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[23] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[22] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[21] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[20] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[19] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[18] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[17] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[16] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[15] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[14] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[13] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[49], w[50], offset); + w[62] = amd_bytealign_S (w[48], w[49], offset); + w[61] = amd_bytealign_S (w[47], w[48], offset); + w[60] = amd_bytealign_S (w[46], w[47], offset); + w[59] = amd_bytealign_S (w[45], w[46], offset); + w[58] = amd_bytealign_S (w[44], w[45], offset); + w[57] = amd_bytealign_S (w[43], w[44], offset); + w[56] = amd_bytealign_S (w[42], w[43], offset); + w[55] = amd_bytealign_S (w[41], w[42], offset); + w[54] = amd_bytealign_S (w[40], w[41], offset); + w[53] = amd_bytealign_S (w[39], w[40], offset); + w[52] = amd_bytealign_S (w[38], w[39], offset); + w[51] = amd_bytealign_S (w[37], w[38], offset); + w[50] = amd_bytealign_S (w[36], w[37], offset); + w[49] = amd_bytealign_S (w[35], w[36], offset); + w[48] = amd_bytealign_S (w[34], w[35], offset); + w[47] = amd_bytealign_S (w[33], w[34], 
offset); + w[46] = amd_bytealign_S (w[32], w[33], offset); + w[45] = amd_bytealign_S (w[31], w[32], offset); + w[44] = amd_bytealign_S (w[30], w[31], offset); + w[43] = amd_bytealign_S (w[29], w[30], offset); + w[42] = amd_bytealign_S (w[28], w[29], offset); + w[41] = amd_bytealign_S (w[27], w[28], offset); + w[40] = amd_bytealign_S (w[26], w[27], offset); + w[39] = amd_bytealign_S (w[25], w[26], offset); + w[38] = amd_bytealign_S (w[24], w[25], offset); + w[37] = amd_bytealign_S (w[23], w[24], offset); + w[36] = amd_bytealign_S (w[22], w[23], offset); + w[35] = amd_bytealign_S (w[21], w[22], offset); + w[34] = amd_bytealign_S (w[20], w[21], offset); + w[33] = amd_bytealign_S (w[19], w[20], offset); + w[32] = amd_bytealign_S (w[18], w[19], offset); + w[31] = amd_bytealign_S (w[17], w[18], offset); + w[30] = amd_bytealign_S (w[16], w[17], offset); + w[29] = amd_bytealign_S (w[15], w[16], offset); + w[28] = amd_bytealign_S (w[14], w[15], offset); + w[27] = amd_bytealign_S (w[13], w[14], offset); + w[26] = amd_bytealign_S (w[12], w[13], offset); + w[25] = amd_bytealign_S (w[11], w[12], offset); + w[24] = amd_bytealign_S (w[10], w[11], offset); + w[23] = amd_bytealign_S (w[ 9], w[10], offset); + w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[13] = amd_bytealign_S ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; @@ -27359,114 +45067,59 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - 
w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 14: - w[63] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[62] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[61] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[60] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[59] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[58] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[57] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[56] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[55] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[54] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[53] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[52] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[51] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[50] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[49] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[48] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[47] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[46] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[45] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[44] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[43] = amd_bytealign_S (w[29], w[28], offset_minus_4); 
- w[42] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[41] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[40] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[39] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[38] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[37] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[36] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[35] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[34] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[33] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[32] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[31] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[30] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[29] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[28] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[27] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[26] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[25] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[24] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[23] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[22] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[21] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[20] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[19] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[18] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[17] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[16] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[15] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[14] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[48], w[49], offset); + w[62] = amd_bytealign_S (w[47], w[48], offset); + w[61] = amd_bytealign_S (w[46], w[47], offset); + w[60] = amd_bytealign_S (w[45], w[46], offset); + w[59] = amd_bytealign_S (w[44], w[45], offset); + w[58] = amd_bytealign_S (w[43], w[44], offset); + w[57] = 
amd_bytealign_S (w[42], w[43], offset); + w[56] = amd_bytealign_S (w[41], w[42], offset); + w[55] = amd_bytealign_S (w[40], w[41], offset); + w[54] = amd_bytealign_S (w[39], w[40], offset); + w[53] = amd_bytealign_S (w[38], w[39], offset); + w[52] = amd_bytealign_S (w[37], w[38], offset); + w[51] = amd_bytealign_S (w[36], w[37], offset); + w[50] = amd_bytealign_S (w[35], w[36], offset); + w[49] = amd_bytealign_S (w[34], w[35], offset); + w[48] = amd_bytealign_S (w[33], w[34], offset); + w[47] = amd_bytealign_S (w[32], w[33], offset); + w[46] = amd_bytealign_S (w[31], w[32], offset); + w[45] = amd_bytealign_S (w[30], w[31], offset); + w[44] = amd_bytealign_S (w[29], w[30], offset); + w[43] = amd_bytealign_S (w[28], w[29], offset); + w[42] = amd_bytealign_S (w[27], w[28], offset); + w[41] = amd_bytealign_S (w[26], w[27], offset); + w[40] = amd_bytealign_S (w[25], w[26], offset); + w[39] = amd_bytealign_S (w[24], w[25], offset); + w[38] = amd_bytealign_S (w[23], w[24], offset); + w[37] = amd_bytealign_S (w[22], w[23], offset); + w[36] = amd_bytealign_S (w[21], w[22], offset); + w[35] = amd_bytealign_S (w[20], w[21], offset); + w[34] = amd_bytealign_S (w[19], w[20], offset); + w[33] = amd_bytealign_S (w[18], w[19], offset); + w[32] = amd_bytealign_S (w[17], w[18], offset); + w[31] = amd_bytealign_S (w[16], w[17], offset); + w[30] = amd_bytealign_S (w[15], w[16], offset); + w[29] = amd_bytealign_S (w[14], w[15], offset); + w[28] = amd_bytealign_S (w[13], w[14], offset); + w[27] = amd_bytealign_S (w[12], w[13], offset); + w[26] = amd_bytealign_S (w[11], w[12], offset); + w[25] = amd_bytealign_S (w[10], w[11], offset); + w[24] = amd_bytealign_S (w[ 9], w[10], offset); + w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[22] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[17] = 
amd_bytealign_S (w[ 2], w[ 3], offset); + w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[14] = amd_bytealign_S ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; @@ -27482,112 +45135,58 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 15: - w[63] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[62] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[61] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[60] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[59] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[58] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[57] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[56] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[55] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[54] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[53] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[52] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[51] = amd_bytealign_S (w[36], w[35], offset_minus_4); - 
w[50] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[49] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[48] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[47] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[46] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[45] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[44] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[43] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[42] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[41] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[40] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[39] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[38] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[37] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[36] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[35] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[34] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[33] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[32] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[31] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[30] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[29] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[28] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[27] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[26] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[25] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[24] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[23] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[22] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[21] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[20] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[19] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[18] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[17] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[16] = amd_bytealign_S (w[ 
1], w[ 0], offset_minus_4); - w[15] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[47], w[48], offset); + w[62] = amd_bytealign_S (w[46], w[47], offset); + w[61] = amd_bytealign_S (w[45], w[46], offset); + w[60] = amd_bytealign_S (w[44], w[45], offset); + w[59] = amd_bytealign_S (w[43], w[44], offset); + w[58] = amd_bytealign_S (w[42], w[43], offset); + w[57] = amd_bytealign_S (w[41], w[42], offset); + w[56] = amd_bytealign_S (w[40], w[41], offset); + w[55] = amd_bytealign_S (w[39], w[40], offset); + w[54] = amd_bytealign_S (w[38], w[39], offset); + w[53] = amd_bytealign_S (w[37], w[38], offset); + w[52] = amd_bytealign_S (w[36], w[37], offset); + w[51] = amd_bytealign_S (w[35], w[36], offset); + w[50] = amd_bytealign_S (w[34], w[35], offset); + w[49] = amd_bytealign_S (w[33], w[34], offset); + w[48] = amd_bytealign_S (w[32], w[33], offset); + w[47] = amd_bytealign_S (w[31], w[32], offset); + w[46] = amd_bytealign_S (w[30], w[31], offset); + w[45] = amd_bytealign_S (w[29], w[30], offset); + w[44] = amd_bytealign_S (w[28], w[29], offset); + w[43] = amd_bytealign_S (w[27], w[28], offset); + w[42] = amd_bytealign_S (w[26], w[27], offset); + w[41] = amd_bytealign_S (w[25], w[26], offset); + w[40] = amd_bytealign_S (w[24], w[25], offset); + w[39] = amd_bytealign_S (w[23], w[24], offset); + w[38] = amd_bytealign_S (w[22], w[23], offset); + w[37] = amd_bytealign_S (w[21], w[22], offset); + w[36] = amd_bytealign_S (w[20], w[21], offset); + w[35] = amd_bytealign_S (w[19], w[20], offset); + w[34] = amd_bytealign_S (w[18], w[19], offset); + w[33] = amd_bytealign_S (w[17], w[18], offset); + w[32] = amd_bytealign_S (w[16], w[17], offset); + w[31] = amd_bytealign_S (w[15], w[16], offset); + w[30] = amd_bytealign_S (w[14], w[15], offset); + w[29] = amd_bytealign_S (w[13], w[14], offset); + w[28] = amd_bytealign_S (w[12], w[13], offset); + w[27] = amd_bytealign_S (w[11], w[12], offset); + w[26] = amd_bytealign_S (w[10], w[11], offset); + w[25] = 
amd_bytealign_S (w[ 9], w[10], offset); + w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[19] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[15] = amd_bytealign_S ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; @@ -27604,110 +45203,57 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 16: - w[63] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[62] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[61] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[60] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[59] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[58] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[57] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[56] = 
amd_bytealign_S (w[40], w[39], offset_minus_4); - w[55] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[54] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[53] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[52] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[51] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[50] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[49] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[48] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[47] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[46] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[45] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[44] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[43] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[42] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[41] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[40] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[39] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[38] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[37] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[36] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[35] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[34] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[33] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[32] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[31] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[30] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[29] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[28] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[27] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[26] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[25] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[24] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[23] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[22] = amd_bytealign_S (w[ 6], w[ 
5], offset_minus_4); - w[21] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[20] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[19] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[18] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[17] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[16] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[46], w[47], offset); + w[62] = amd_bytealign_S (w[45], w[46], offset); + w[61] = amd_bytealign_S (w[44], w[45], offset); + w[60] = amd_bytealign_S (w[43], w[44], offset); + w[59] = amd_bytealign_S (w[42], w[43], offset); + w[58] = amd_bytealign_S (w[41], w[42], offset); + w[57] = amd_bytealign_S (w[40], w[41], offset); + w[56] = amd_bytealign_S (w[39], w[40], offset); + w[55] = amd_bytealign_S (w[38], w[39], offset); + w[54] = amd_bytealign_S (w[37], w[38], offset); + w[53] = amd_bytealign_S (w[36], w[37], offset); + w[52] = amd_bytealign_S (w[35], w[36], offset); + w[51] = amd_bytealign_S (w[34], w[35], offset); + w[50] = amd_bytealign_S (w[33], w[34], offset); + w[49] = amd_bytealign_S (w[32], w[33], offset); + w[48] = amd_bytealign_S (w[31], w[32], offset); + w[47] = amd_bytealign_S (w[30], w[31], offset); + w[46] = amd_bytealign_S (w[29], w[30], offset); + w[45] = amd_bytealign_S (w[28], w[29], offset); + w[44] = amd_bytealign_S (w[27], w[28], offset); + w[43] = amd_bytealign_S (w[26], w[27], offset); + w[42] = amd_bytealign_S (w[25], w[26], offset); + w[41] = amd_bytealign_S (w[24], w[25], offset); + w[40] = amd_bytealign_S (w[23], w[24], offset); + w[39] = amd_bytealign_S (w[22], w[23], offset); + w[38] = amd_bytealign_S (w[21], w[22], offset); + w[37] = amd_bytealign_S (w[20], w[21], offset); + w[36] = amd_bytealign_S (w[19], w[20], offset); + w[35] = amd_bytealign_S (w[18], w[19], offset); + w[34] = amd_bytealign_S (w[17], w[18], offset); + w[33] = amd_bytealign_S (w[16], w[17], offset); + w[32] = amd_bytealign_S (w[15], w[16], offset); + w[31] = amd_bytealign_S (w[14], 
w[15], offset); + w[30] = amd_bytealign_S (w[13], w[14], offset); + w[29] = amd_bytealign_S (w[12], w[13], offset); + w[28] = amd_bytealign_S (w[11], w[12], offset); + w[27] = amd_bytealign_S (w[10], w[11], offset); + w[26] = amd_bytealign_S (w[ 9], w[10], offset); + w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[16] = amd_bytealign_S ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; @@ -27725,108 +45271,56 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 17: - w[63] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[62] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[61] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[60] = amd_bytealign_S (w[43], w[42], 
offset_minus_4); - w[59] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[58] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[57] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[56] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[55] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[54] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[53] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[52] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[51] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[50] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[49] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[48] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[47] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[46] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[45] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[44] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[43] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[42] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[41] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[40] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[39] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[38] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[37] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[36] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[35] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[34] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[33] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[32] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[31] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[30] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[29] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[28] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[27] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[26] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[25] = 
amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[24] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[23] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[22] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[21] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[20] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[19] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[18] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[17] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[45], w[46], offset); + w[62] = amd_bytealign_S (w[44], w[45], offset); + w[61] = amd_bytealign_S (w[43], w[44], offset); + w[60] = amd_bytealign_S (w[42], w[43], offset); + w[59] = amd_bytealign_S (w[41], w[42], offset); + w[58] = amd_bytealign_S (w[40], w[41], offset); + w[57] = amd_bytealign_S (w[39], w[40], offset); + w[56] = amd_bytealign_S (w[38], w[39], offset); + w[55] = amd_bytealign_S (w[37], w[38], offset); + w[54] = amd_bytealign_S (w[36], w[37], offset); + w[53] = amd_bytealign_S (w[35], w[36], offset); + w[52] = amd_bytealign_S (w[34], w[35], offset); + w[51] = amd_bytealign_S (w[33], w[34], offset); + w[50] = amd_bytealign_S (w[32], w[33], offset); + w[49] = amd_bytealign_S (w[31], w[32], offset); + w[48] = amd_bytealign_S (w[30], w[31], offset); + w[47] = amd_bytealign_S (w[29], w[30], offset); + w[46] = amd_bytealign_S (w[28], w[29], offset); + w[45] = amd_bytealign_S (w[27], w[28], offset); + w[44] = amd_bytealign_S (w[26], w[27], offset); + w[43] = amd_bytealign_S (w[25], w[26], offset); + w[42] = amd_bytealign_S (w[24], w[25], offset); + w[41] = amd_bytealign_S (w[23], w[24], offset); + w[40] = amd_bytealign_S (w[22], w[23], offset); + w[39] = amd_bytealign_S (w[21], w[22], offset); + w[38] = amd_bytealign_S (w[20], w[21], offset); + w[37] = amd_bytealign_S (w[19], w[20], offset); + w[36] = amd_bytealign_S (w[18], w[19], offset); + w[35] = amd_bytealign_S (w[17], w[18], offset); + w[34] = amd_bytealign_S (w[16], w[17], 
offset); + w[33] = amd_bytealign_S (w[15], w[16], offset); + w[32] = amd_bytealign_S (w[14], w[15], offset); + w[31] = amd_bytealign_S (w[13], w[14], offset); + w[30] = amd_bytealign_S (w[12], w[13], offset); + w[29] = amd_bytealign_S (w[11], w[12], offset); + w[28] = amd_bytealign_S (w[10], w[11], offset); + w[27] = amd_bytealign_S (w[ 9], w[10], offset); + w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[18] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[17] = amd_bytealign_S ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; @@ -27845,106 +45339,55 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 18: - w[63] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[62] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[61] = 
amd_bytealign_S (w[43], w[42], offset_minus_4); - w[60] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[59] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[58] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[57] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[56] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[55] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[54] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[53] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[52] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[51] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[50] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[49] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[48] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[47] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[46] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[45] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[44] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[43] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[42] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[41] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[40] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[39] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[38] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[37] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[36] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[35] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[34] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[33] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[32] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[31] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[30] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[29] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[28] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[27] = amd_bytealign_S (w[ 9], w[ 
8], offset_minus_4); - w[26] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[25] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[24] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[23] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[22] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[21] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[20] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[19] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[18] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[44], w[45], offset); + w[62] = amd_bytealign_S (w[43], w[44], offset); + w[61] = amd_bytealign_S (w[42], w[43], offset); + w[60] = amd_bytealign_S (w[41], w[42], offset); + w[59] = amd_bytealign_S (w[40], w[41], offset); + w[58] = amd_bytealign_S (w[39], w[40], offset); + w[57] = amd_bytealign_S (w[38], w[39], offset); + w[56] = amd_bytealign_S (w[37], w[38], offset); + w[55] = amd_bytealign_S (w[36], w[37], offset); + w[54] = amd_bytealign_S (w[35], w[36], offset); + w[53] = amd_bytealign_S (w[34], w[35], offset); + w[52] = amd_bytealign_S (w[33], w[34], offset); + w[51] = amd_bytealign_S (w[32], w[33], offset); + w[50] = amd_bytealign_S (w[31], w[32], offset); + w[49] = amd_bytealign_S (w[30], w[31], offset); + w[48] = amd_bytealign_S (w[29], w[30], offset); + w[47] = amd_bytealign_S (w[28], w[29], offset); + w[46] = amd_bytealign_S (w[27], w[28], offset); + w[45] = amd_bytealign_S (w[26], w[27], offset); + w[44] = amd_bytealign_S (w[25], w[26], offset); + w[43] = amd_bytealign_S (w[24], w[25], offset); + w[42] = amd_bytealign_S (w[23], w[24], offset); + w[41] = amd_bytealign_S (w[22], w[23], offset); + w[40] = amd_bytealign_S (w[21], w[22], offset); + w[39] = amd_bytealign_S (w[20], w[21], offset); + w[38] = amd_bytealign_S (w[19], w[20], offset); + w[37] = amd_bytealign_S (w[18], w[19], offset); + w[36] = amd_bytealign_S (w[17], w[18], offset); + w[35] = amd_bytealign_S (w[16], w[17], offset); + w[34] = 
amd_bytealign_S (w[15], w[16], offset); + w[33] = amd_bytealign_S (w[14], w[15], offset); + w[32] = amd_bytealign_S (w[13], w[14], offset); + w[31] = amd_bytealign_S (w[12], w[13], offset); + w[30] = amd_bytealign_S (w[11], w[12], offset); + w[29] = amd_bytealign_S (w[10], w[11], offset); + w[28] = amd_bytealign_S (w[ 9], w[10], offset); + w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[24] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[18] = amd_bytealign_S ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; @@ -27964,104 +45407,54 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 19: - w[63] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[62] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[61] = amd_bytealign_S (w[42], w[41], offset_minus_4); - 
w[60] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[59] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[58] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[57] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[56] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[55] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[54] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[53] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[52] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[51] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[50] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[49] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[48] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[47] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[46] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[45] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[44] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[43] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[42] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[41] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[40] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[39] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[38] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[37] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[36] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[35] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[34] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[33] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[32] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[31] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[30] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[29] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[28] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[27] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[26] = amd_bytealign_S (w[ 
7], w[ 6], offset_minus_4); - w[25] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[24] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[23] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[22] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[21] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[20] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[19] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[43], w[44], offset); + w[62] = amd_bytealign_S (w[42], w[43], offset); + w[61] = amd_bytealign_S (w[41], w[42], offset); + w[60] = amd_bytealign_S (w[40], w[41], offset); + w[59] = amd_bytealign_S (w[39], w[40], offset); + w[58] = amd_bytealign_S (w[38], w[39], offset); + w[57] = amd_bytealign_S (w[37], w[38], offset); + w[56] = amd_bytealign_S (w[36], w[37], offset); + w[55] = amd_bytealign_S (w[35], w[36], offset); + w[54] = amd_bytealign_S (w[34], w[35], offset); + w[53] = amd_bytealign_S (w[33], w[34], offset); + w[52] = amd_bytealign_S (w[32], w[33], offset); + w[51] = amd_bytealign_S (w[31], w[32], offset); + w[50] = amd_bytealign_S (w[30], w[31], offset); + w[49] = amd_bytealign_S (w[29], w[30], offset); + w[48] = amd_bytealign_S (w[28], w[29], offset); + w[47] = amd_bytealign_S (w[27], w[28], offset); + w[46] = amd_bytealign_S (w[26], w[27], offset); + w[45] = amd_bytealign_S (w[25], w[26], offset); + w[44] = amd_bytealign_S (w[24], w[25], offset); + w[43] = amd_bytealign_S (w[23], w[24], offset); + w[42] = amd_bytealign_S (w[22], w[23], offset); + w[41] = amd_bytealign_S (w[21], w[22], offset); + w[40] = amd_bytealign_S (w[20], w[21], offset); + w[39] = amd_bytealign_S (w[19], w[20], offset); + w[38] = amd_bytealign_S (w[18], w[19], offset); + w[37] = amd_bytealign_S (w[17], w[18], offset); + w[36] = amd_bytealign_S (w[16], w[17], offset); + w[35] = amd_bytealign_S (w[15], w[16], offset); + w[34] = amd_bytealign_S (w[14], w[15], offset); + w[33] = amd_bytealign_S (w[13], w[14], offset); + w[32] = 
amd_bytealign_S (w[12], w[13], offset); + w[31] = amd_bytealign_S (w[11], w[12], offset); + w[30] = amd_bytealign_S (w[10], w[11], offset); + w[29] = amd_bytealign_S (w[ 9], w[10], offset); + w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[19] = amd_bytealign_S ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; @@ -28082,102 +45475,53 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 20: - w[63] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[62] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[61] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[60] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[59] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[58] = amd_bytealign_S (w[38], w[37], 
offset_minus_4); - w[57] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[56] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[55] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[54] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[53] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[52] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[51] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[50] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[49] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[48] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[47] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[46] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[45] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[44] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[43] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[42] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[41] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[40] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[39] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[38] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[37] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[36] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[35] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[34] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[33] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[32] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[31] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[30] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[29] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[28] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[27] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[26] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[25] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[24] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[23] = 
amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[22] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[21] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[20] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[42], w[43], offset); + w[62] = amd_bytealign_S (w[41], w[42], offset); + w[61] = amd_bytealign_S (w[40], w[41], offset); + w[60] = amd_bytealign_S (w[39], w[40], offset); + w[59] = amd_bytealign_S (w[38], w[39], offset); + w[58] = amd_bytealign_S (w[37], w[38], offset); + w[57] = amd_bytealign_S (w[36], w[37], offset); + w[56] = amd_bytealign_S (w[35], w[36], offset); + w[55] = amd_bytealign_S (w[34], w[35], offset); + w[54] = amd_bytealign_S (w[33], w[34], offset); + w[53] = amd_bytealign_S (w[32], w[33], offset); + w[52] = amd_bytealign_S (w[31], w[32], offset); + w[51] = amd_bytealign_S (w[30], w[31], offset); + w[50] = amd_bytealign_S (w[29], w[30], offset); + w[49] = amd_bytealign_S (w[28], w[29], offset); + w[48] = amd_bytealign_S (w[27], w[28], offset); + w[47] = amd_bytealign_S (w[26], w[27], offset); + w[46] = amd_bytealign_S (w[25], w[26], offset); + w[45] = amd_bytealign_S (w[24], w[25], offset); + w[44] = amd_bytealign_S (w[23], w[24], offset); + w[43] = amd_bytealign_S (w[22], w[23], offset); + w[42] = amd_bytealign_S (w[21], w[22], offset); + w[41] = amd_bytealign_S (w[20], w[21], offset); + w[40] = amd_bytealign_S (w[19], w[20], offset); + w[39] = amd_bytealign_S (w[18], w[19], offset); + w[38] = amd_bytealign_S (w[17], w[18], offset); + w[37] = amd_bytealign_S (w[16], w[17], offset); + w[36] = amd_bytealign_S (w[15], w[16], offset); + w[35] = amd_bytealign_S (w[14], w[15], offset); + w[34] = amd_bytealign_S (w[13], w[14], offset); + w[33] = amd_bytealign_S (w[12], w[13], offset); + w[32] = amd_bytealign_S (w[11], w[12], offset); + w[31] = amd_bytealign_S (w[10], w[11], offset); + w[30] = amd_bytealign_S (w[ 9], w[10], offset); + w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[28] = amd_bytealign_S (w[ 
7], w[ 8], offset); + w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[26] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[20] = amd_bytealign_S ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; @@ -28199,100 +45543,52 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 21: - w[63] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[62] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[61] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[60] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[59] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[58] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[57] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[56] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[55] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[54] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[53] = amd_bytealign_S (w[32], w[31], 
offset_minus_4); - w[52] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[51] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[50] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[49] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[48] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[47] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[46] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[45] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[44] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[43] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[42] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[41] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[40] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[39] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[38] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[37] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[36] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[35] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[34] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[33] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[32] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[31] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[30] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[29] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[28] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[27] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[26] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[25] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[24] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[23] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[22] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[21] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[41], w[42], offset); + w[62] = amd_bytealign_S (w[40], w[41], offset); + w[61] = amd_bytealign_S 
(w[39], w[40], offset); + w[60] = amd_bytealign_S (w[38], w[39], offset); + w[59] = amd_bytealign_S (w[37], w[38], offset); + w[58] = amd_bytealign_S (w[36], w[37], offset); + w[57] = amd_bytealign_S (w[35], w[36], offset); + w[56] = amd_bytealign_S (w[34], w[35], offset); + w[55] = amd_bytealign_S (w[33], w[34], offset); + w[54] = amd_bytealign_S (w[32], w[33], offset); + w[53] = amd_bytealign_S (w[31], w[32], offset); + w[52] = amd_bytealign_S (w[30], w[31], offset); + w[51] = amd_bytealign_S (w[29], w[30], offset); + w[50] = amd_bytealign_S (w[28], w[29], offset); + w[49] = amd_bytealign_S (w[27], w[28], offset); + w[48] = amd_bytealign_S (w[26], w[27], offset); + w[47] = amd_bytealign_S (w[25], w[26], offset); + w[46] = amd_bytealign_S (w[24], w[25], offset); + w[45] = amd_bytealign_S (w[23], w[24], offset); + w[44] = amd_bytealign_S (w[22], w[23], offset); + w[43] = amd_bytealign_S (w[21], w[22], offset); + w[42] = amd_bytealign_S (w[20], w[21], offset); + w[41] = amd_bytealign_S (w[19], w[20], offset); + w[40] = amd_bytealign_S (w[18], w[19], offset); + w[39] = amd_bytealign_S (w[17], w[18], offset); + w[38] = amd_bytealign_S (w[16], w[17], offset); + w[37] = amd_bytealign_S (w[15], w[16], offset); + w[36] = amd_bytealign_S (w[14], w[15], offset); + w[35] = amd_bytealign_S (w[13], w[14], offset); + w[34] = amd_bytealign_S (w[12], w[13], offset); + w[33] = amd_bytealign_S (w[11], w[12], offset); + w[32] = amd_bytealign_S (w[10], w[11], offset); + w[31] = amd_bytealign_S (w[ 9], w[10], offset); + w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[21] = amd_bytealign_S 
( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; @@ -28315,98 +45611,51 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 22: - w[63] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[62] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[61] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[60] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[59] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[58] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[57] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[56] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[55] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[54] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[53] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[52] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[51] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[50] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[49] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[48] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[47] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[46] = amd_bytealign_S (w[24], w[23], offset_minus_4); 
- w[45] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[44] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[43] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[42] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[41] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[40] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[39] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[38] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[37] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[36] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[35] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[34] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[33] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[32] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[31] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[30] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[29] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[28] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[27] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[26] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[25] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[24] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[23] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[22] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[40], w[41], offset); + w[62] = amd_bytealign_S (w[39], w[40], offset); + w[61] = amd_bytealign_S (w[38], w[39], offset); + w[60] = amd_bytealign_S (w[37], w[38], offset); + w[59] = amd_bytealign_S (w[36], w[37], offset); + w[58] = amd_bytealign_S (w[35], w[36], offset); + w[57] = amd_bytealign_S (w[34], w[35], offset); + w[56] = amd_bytealign_S (w[33], w[34], offset); + w[55] = amd_bytealign_S (w[32], w[33], offset); + w[54] = amd_bytealign_S (w[31], w[32], offset); + w[53] = amd_bytealign_S (w[30], w[31], offset); + w[52] = amd_bytealign_S (w[29], w[30], offset); + w[51] = 
amd_bytealign_S (w[28], w[29], offset); + w[50] = amd_bytealign_S (w[27], w[28], offset); + w[49] = amd_bytealign_S (w[26], w[27], offset); + w[48] = amd_bytealign_S (w[25], w[26], offset); + w[47] = amd_bytealign_S (w[24], w[25], offset); + w[46] = amd_bytealign_S (w[23], w[24], offset); + w[45] = amd_bytealign_S (w[22], w[23], offset); + w[44] = amd_bytealign_S (w[21], w[22], offset); + w[43] = amd_bytealign_S (w[20], w[21], offset); + w[42] = amd_bytealign_S (w[19], w[20], offset); + w[41] = amd_bytealign_S (w[18], w[19], offset); + w[40] = amd_bytealign_S (w[17], w[18], offset); + w[39] = amd_bytealign_S (w[16], w[17], offset); + w[38] = amd_bytealign_S (w[15], w[16], offset); + w[37] = amd_bytealign_S (w[14], w[15], offset); + w[36] = amd_bytealign_S (w[13], w[14], offset); + w[35] = amd_bytealign_S (w[12], w[13], offset); + w[34] = amd_bytealign_S (w[11], w[12], offset); + w[33] = amd_bytealign_S (w[10], w[11], offset); + w[32] = amd_bytealign_S (w[ 9], w[10], offset); + w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[22] = amd_bytealign_S ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; @@ -28430,96 +45679,50 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - 
w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 23: - w[63] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[62] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[61] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[60] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[59] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[58] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[57] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[56] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[55] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[54] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[53] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[52] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[51] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[50] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[49] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[48] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[47] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[46] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[45] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[44] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[43] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[42] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[41] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[40] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[39] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[38] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[37] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[36] = amd_bytealign_S 
(w[13], w[12], offset_minus_4); - w[35] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[34] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[33] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[32] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[31] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[30] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[29] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[28] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[27] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[26] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[25] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[24] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[23] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[39], w[40], offset); + w[62] = amd_bytealign_S (w[38], w[39], offset); + w[61] = amd_bytealign_S (w[37], w[38], offset); + w[60] = amd_bytealign_S (w[36], w[37], offset); + w[59] = amd_bytealign_S (w[35], w[36], offset); + w[58] = amd_bytealign_S (w[34], w[35], offset); + w[57] = amd_bytealign_S (w[33], w[34], offset); + w[56] = amd_bytealign_S (w[32], w[33], offset); + w[55] = amd_bytealign_S (w[31], w[32], offset); + w[54] = amd_bytealign_S (w[30], w[31], offset); + w[53] = amd_bytealign_S (w[29], w[30], offset); + w[52] = amd_bytealign_S (w[28], w[29], offset); + w[51] = amd_bytealign_S (w[27], w[28], offset); + w[50] = amd_bytealign_S (w[26], w[27], offset); + w[49] = amd_bytealign_S (w[25], w[26], offset); + w[48] = amd_bytealign_S (w[24], w[25], offset); + w[47] = amd_bytealign_S (w[23], w[24], offset); + w[46] = amd_bytealign_S (w[22], w[23], offset); + w[45] = amd_bytealign_S (w[21], w[22], offset); + w[44] = amd_bytealign_S (w[20], w[21], offset); + w[43] = amd_bytealign_S (w[19], w[20], offset); + w[42] = amd_bytealign_S (w[18], w[19], offset); + w[41] = amd_bytealign_S (w[17], w[18], offset); + w[40] = amd_bytealign_S (w[16], w[17], offset); + w[39] = 
amd_bytealign_S (w[15], w[16], offset); + w[38] = amd_bytealign_S (w[14], w[15], offset); + w[37] = amd_bytealign_S (w[13], w[14], offset); + w[36] = amd_bytealign_S (w[12], w[13], offset); + w[35] = amd_bytealign_S (w[11], w[12], offset); + w[34] = amd_bytealign_S (w[10], w[11], offset); + w[33] = amd_bytealign_S (w[ 9], w[10], offset); + w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[23] = amd_bytealign_S ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; @@ -28544,94 +45747,49 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 24: - w[63] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[62] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[61] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[60] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[59] = amd_bytealign_S 
(w[35], w[34], offset_minus_4); - w[58] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[57] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[56] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[55] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[54] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[53] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[52] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[51] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[50] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[49] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[48] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[47] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[46] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[45] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[44] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[43] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[42] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[41] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[40] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[39] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[38] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[37] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[36] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[35] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[34] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[33] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[32] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[31] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[30] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[29] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[28] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[27] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[26] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[25] = amd_bytealign_S (w[ 1], w[ 0], 
offset_minus_4); - w[24] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[38], w[39], offset); + w[62] = amd_bytealign_S (w[37], w[38], offset); + w[61] = amd_bytealign_S (w[36], w[37], offset); + w[60] = amd_bytealign_S (w[35], w[36], offset); + w[59] = amd_bytealign_S (w[34], w[35], offset); + w[58] = amd_bytealign_S (w[33], w[34], offset); + w[57] = amd_bytealign_S (w[32], w[33], offset); + w[56] = amd_bytealign_S (w[31], w[32], offset); + w[55] = amd_bytealign_S (w[30], w[31], offset); + w[54] = amd_bytealign_S (w[29], w[30], offset); + w[53] = amd_bytealign_S (w[28], w[29], offset); + w[52] = amd_bytealign_S (w[27], w[28], offset); + w[51] = amd_bytealign_S (w[26], w[27], offset); + w[50] = amd_bytealign_S (w[25], w[26], offset); + w[49] = amd_bytealign_S (w[24], w[25], offset); + w[48] = amd_bytealign_S (w[23], w[24], offset); + w[47] = amd_bytealign_S (w[22], w[23], offset); + w[46] = amd_bytealign_S (w[21], w[22], offset); + w[45] = amd_bytealign_S (w[20], w[21], offset); + w[44] = amd_bytealign_S (w[19], w[20], offset); + w[43] = amd_bytealign_S (w[18], w[19], offset); + w[42] = amd_bytealign_S (w[17], w[18], offset); + w[41] = amd_bytealign_S (w[16], w[17], offset); + w[40] = amd_bytealign_S (w[15], w[16], offset); + w[39] = amd_bytealign_S (w[14], w[15], offset); + w[38] = amd_bytealign_S (w[13], w[14], offset); + w[37] = amd_bytealign_S (w[12], w[13], offset); + w[36] = amd_bytealign_S (w[11], w[12], offset); + w[35] = amd_bytealign_S (w[10], w[11], offset); + w[34] = amd_bytealign_S (w[ 9], w[10], offset); + w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[29] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[25] = amd_bytealign_S 
(w[ 0], w[ 1], offset); + w[24] = amd_bytealign_S ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; @@ -28657,92 +45815,48 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 25: - w[63] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[62] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[61] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[60] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[59] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[58] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[57] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[56] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[55] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[54] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[53] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[52] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[51] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[50] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[49] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[48] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[47] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[46] = amd_bytealign_S (w[21], w[20], offset_minus_4); 
- w[45] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[44] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[43] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[42] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[41] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[40] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[39] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[38] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[37] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[36] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[35] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[34] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[33] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[32] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[31] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[30] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[29] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[28] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[27] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[26] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[25] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[37], w[38], offset); + w[62] = amd_bytealign_S (w[36], w[37], offset); + w[61] = amd_bytealign_S (w[35], w[36], offset); + w[60] = amd_bytealign_S (w[34], w[35], offset); + w[59] = amd_bytealign_S (w[33], w[34], offset); + w[58] = amd_bytealign_S (w[32], w[33], offset); + w[57] = amd_bytealign_S (w[31], w[32], offset); + w[56] = amd_bytealign_S (w[30], w[31], offset); + w[55] = amd_bytealign_S (w[29], w[30], offset); + w[54] = amd_bytealign_S (w[28], w[29], offset); + w[53] = amd_bytealign_S (w[27], w[28], offset); + w[52] = amd_bytealign_S (w[26], w[27], offset); + w[51] = amd_bytealign_S (w[25], w[26], offset); + w[50] = amd_bytealign_S (w[24], w[25], offset); + w[49] = amd_bytealign_S (w[23], w[24], offset); + w[48] = amd_bytealign_S (w[22], 
w[23], offset); + w[47] = amd_bytealign_S (w[21], w[22], offset); + w[46] = amd_bytealign_S (w[20], w[21], offset); + w[45] = amd_bytealign_S (w[19], w[20], offset); + w[44] = amd_bytealign_S (w[18], w[19], offset); + w[43] = amd_bytealign_S (w[17], w[18], offset); + w[42] = amd_bytealign_S (w[16], w[17], offset); + w[41] = amd_bytealign_S (w[15], w[16], offset); + w[40] = amd_bytealign_S (w[14], w[15], offset); + w[39] = amd_bytealign_S (w[13], w[14], offset); + w[38] = amd_bytealign_S (w[12], w[13], offset); + w[37] = amd_bytealign_S (w[11], w[12], offset); + w[36] = amd_bytealign_S (w[10], w[11], offset); + w[35] = amd_bytealign_S (w[ 9], w[10], offset); + w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[31] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[25] = amd_bytealign_S ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; @@ -28769,90 +45883,47 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - 
w[63] = 0; - } - break; case 26: - w[63] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[62] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[61] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[60] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[59] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[58] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[57] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[56] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[55] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[54] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[53] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[52] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[51] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[50] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[49] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[48] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[47] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[46] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[45] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[44] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[43] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[42] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[41] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[40] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[39] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[38] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[37] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[36] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[35] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[34] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[33] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[32] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[31] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[30] = amd_bytealign_S (w[ 4], w[ 3], 
offset_minus_4); - w[29] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[28] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[27] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[26] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[36], w[37], offset); + w[62] = amd_bytealign_S (w[35], w[36], offset); + w[61] = amd_bytealign_S (w[34], w[35], offset); + w[60] = amd_bytealign_S (w[33], w[34], offset); + w[59] = amd_bytealign_S (w[32], w[33], offset); + w[58] = amd_bytealign_S (w[31], w[32], offset); + w[57] = amd_bytealign_S (w[30], w[31], offset); + w[56] = amd_bytealign_S (w[29], w[30], offset); + w[55] = amd_bytealign_S (w[28], w[29], offset); + w[54] = amd_bytealign_S (w[27], w[28], offset); + w[53] = amd_bytealign_S (w[26], w[27], offset); + w[52] = amd_bytealign_S (w[25], w[26], offset); + w[51] = amd_bytealign_S (w[24], w[25], offset); + w[50] = amd_bytealign_S (w[23], w[24], offset); + w[49] = amd_bytealign_S (w[22], w[23], offset); + w[48] = amd_bytealign_S (w[21], w[22], offset); + w[47] = amd_bytealign_S (w[20], w[21], offset); + w[46] = amd_bytealign_S (w[19], w[20], offset); + w[45] = amd_bytealign_S (w[18], w[19], offset); + w[44] = amd_bytealign_S (w[17], w[18], offset); + w[43] = amd_bytealign_S (w[16], w[17], offset); + w[42] = amd_bytealign_S (w[15], w[16], offset); + w[41] = amd_bytealign_S (w[14], w[15], offset); + w[40] = amd_bytealign_S (w[13], w[14], offset); + w[39] = amd_bytealign_S (w[12], w[13], offset); + w[38] = amd_bytealign_S (w[11], w[12], offset); + w[37] = amd_bytealign_S (w[10], w[11], offset); + w[36] = amd_bytealign_S (w[ 9], w[10], offset); + w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[31] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); + 
w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[26] = amd_bytealign_S ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; @@ -28880,88 +45951,46 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 27: - w[63] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[62] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[61] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[60] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[59] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[58] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[57] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[56] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[55] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[54] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[53] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[52] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[51] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[50] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[49] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[48] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[47] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[46] = 
amd_bytealign_S (w[19], w[18], offset_minus_4); - w[45] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[44] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[43] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[42] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[41] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[40] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[39] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[38] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[37] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[36] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[35] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[34] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[33] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[32] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[31] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[30] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[29] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[28] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[27] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[35], w[36], offset); + w[62] = amd_bytealign_S (w[34], w[35], offset); + w[61] = amd_bytealign_S (w[33], w[34], offset); + w[60] = amd_bytealign_S (w[32], w[33], offset); + w[59] = amd_bytealign_S (w[31], w[32], offset); + w[58] = amd_bytealign_S (w[30], w[31], offset); + w[57] = amd_bytealign_S (w[29], w[30], offset); + w[56] = amd_bytealign_S (w[28], w[29], offset); + w[55] = amd_bytealign_S (w[27], w[28], offset); + w[54] = amd_bytealign_S (w[26], w[27], offset); + w[53] = amd_bytealign_S (w[25], w[26], offset); + w[52] = amd_bytealign_S (w[24], w[25], offset); + w[51] = amd_bytealign_S (w[23], w[24], offset); + w[50] = amd_bytealign_S (w[22], w[23], offset); + w[49] = amd_bytealign_S (w[21], w[22], offset); + w[48] = amd_bytealign_S (w[20], w[21], offset); + w[47] = amd_bytealign_S (w[19], w[20], offset); + 
w[46] = amd_bytealign_S (w[18], w[19], offset); + w[45] = amd_bytealign_S (w[17], w[18], offset); + w[44] = amd_bytealign_S (w[16], w[17], offset); + w[43] = amd_bytealign_S (w[15], w[16], offset); + w[42] = amd_bytealign_S (w[14], w[15], offset); + w[41] = amd_bytealign_S (w[13], w[14], offset); + w[40] = amd_bytealign_S (w[12], w[13], offset); + w[39] = amd_bytealign_S (w[11], w[12], offset); + w[38] = amd_bytealign_S (w[10], w[11], offset); + w[37] = amd_bytealign_S (w[ 9], w[10], offset); + w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[30] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[27] = amd_bytealign_S ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; @@ -28990,86 +46019,45 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 28: - w[63] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[62] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[61] = amd_bytealign_S (w[33], w[32], 
offset_minus_4); - w[60] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[59] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[58] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[57] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[56] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[55] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[54] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[53] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[52] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[51] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[50] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[49] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[48] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[47] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[46] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[45] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[44] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[43] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[42] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[41] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[40] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[39] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[38] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[37] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[36] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[35] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[34] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[33] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[32] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[31] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[30] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[29] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[28] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[34], w[35], offset); + w[62] = 
amd_bytealign_S (w[33], w[34], offset); + w[61] = amd_bytealign_S (w[32], w[33], offset); + w[60] = amd_bytealign_S (w[31], w[32], offset); + w[59] = amd_bytealign_S (w[30], w[31], offset); + w[58] = amd_bytealign_S (w[29], w[30], offset); + w[57] = amd_bytealign_S (w[28], w[29], offset); + w[56] = amd_bytealign_S (w[27], w[28], offset); + w[55] = amd_bytealign_S (w[26], w[27], offset); + w[54] = amd_bytealign_S (w[25], w[26], offset); + w[53] = amd_bytealign_S (w[24], w[25], offset); + w[52] = amd_bytealign_S (w[23], w[24], offset); + w[51] = amd_bytealign_S (w[22], w[23], offset); + w[50] = amd_bytealign_S (w[21], w[22], offset); + w[49] = amd_bytealign_S (w[20], w[21], offset); + w[48] = amd_bytealign_S (w[19], w[20], offset); + w[47] = amd_bytealign_S (w[18], w[19], offset); + w[46] = amd_bytealign_S (w[17], w[18], offset); + w[45] = amd_bytealign_S (w[16], w[17], offset); + w[44] = amd_bytealign_S (w[15], w[16], offset); + w[43] = amd_bytealign_S (w[14], w[15], offset); + w[42] = amd_bytealign_S (w[13], w[14], offset); + w[41] = amd_bytealign_S (w[12], w[13], offset); + w[40] = amd_bytealign_S (w[11], w[12], offset); + w[39] = amd_bytealign_S (w[10], w[11], offset); + w[38] = amd_bytealign_S (w[ 9], w[10], offset); + w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[35] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[28] = amd_bytealign_S ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; @@ -29099,84 +46087,44 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] 
= w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 29: - w[63] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[62] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[61] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[60] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[59] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[58] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[57] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[56] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[55] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[54] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[53] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[52] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[51] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[50] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[49] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[48] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[47] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[46] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[45] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[44] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[43] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[42] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[41] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[40] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[39] = amd_bytealign_S (w[10], w[ 9], 
offset_minus_4); - w[38] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[37] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[36] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[35] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[34] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[33] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[32] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[31] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[30] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[29] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[33], w[34], offset); + w[62] = amd_bytealign_S (w[32], w[33], offset); + w[61] = amd_bytealign_S (w[31], w[32], offset); + w[60] = amd_bytealign_S (w[30], w[31], offset); + w[59] = amd_bytealign_S (w[29], w[30], offset); + w[58] = amd_bytealign_S (w[28], w[29], offset); + w[57] = amd_bytealign_S (w[27], w[28], offset); + w[56] = amd_bytealign_S (w[26], w[27], offset); + w[55] = amd_bytealign_S (w[25], w[26], offset); + w[54] = amd_bytealign_S (w[24], w[25], offset); + w[53] = amd_bytealign_S (w[23], w[24], offset); + w[52] = amd_bytealign_S (w[22], w[23], offset); + w[51] = amd_bytealign_S (w[21], w[22], offset); + w[50] = amd_bytealign_S (w[20], w[21], offset); + w[49] = amd_bytealign_S (w[19], w[20], offset); + w[48] = amd_bytealign_S (w[18], w[19], offset); + w[47] = amd_bytealign_S (w[17], w[18], offset); + w[46] = amd_bytealign_S (w[16], w[17], offset); + w[45] = amd_bytealign_S (w[15], w[16], offset); + w[44] = amd_bytealign_S (w[14], w[15], offset); + w[43] = amd_bytealign_S (w[13], w[14], offset); + w[42] = amd_bytealign_S (w[12], w[13], offset); + w[41] = amd_bytealign_S (w[11], w[12], offset); + w[40] = amd_bytealign_S (w[10], w[11], offset); + w[39] = amd_bytealign_S (w[ 9], w[10], offset); + w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[36] = amd_bytealign_S (w[ 6], w[ 7], offset); + 
w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[29] = amd_bytealign_S ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; @@ -29207,82 +46155,43 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 30: - w[63] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[62] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[61] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[60] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[59] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[58] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[57] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[56] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[55] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[54] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[53] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[52] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[51] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[50] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[49] = amd_bytealign_S (w[19], w[18], 
offset_minus_4); - w[48] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[47] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[46] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[45] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[44] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[43] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[42] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[41] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[40] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[39] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[38] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[37] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[36] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[35] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[34] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[33] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[32] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[31] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[30] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[32], w[33], offset); + w[62] = amd_bytealign_S (w[31], w[32], offset); + w[61] = amd_bytealign_S (w[30], w[31], offset); + w[60] = amd_bytealign_S (w[29], w[30], offset); + w[59] = amd_bytealign_S (w[28], w[29], offset); + w[58] = amd_bytealign_S (w[27], w[28], offset); + w[57] = amd_bytealign_S (w[26], w[27], offset); + w[56] = amd_bytealign_S (w[25], w[26], offset); + w[55] = amd_bytealign_S (w[24], w[25], offset); + w[54] = amd_bytealign_S (w[23], w[24], offset); + w[53] = amd_bytealign_S (w[22], w[23], offset); + w[52] = amd_bytealign_S (w[21], w[22], offset); + w[51] = amd_bytealign_S (w[20], w[21], offset); + w[50] = amd_bytealign_S (w[19], w[20], offset); + w[49] = amd_bytealign_S (w[18], w[19], offset); + w[48] = amd_bytealign_S (w[17], w[18], offset); + w[47] = amd_bytealign_S (w[16], w[17], offset); + w[46] = amd_bytealign_S (w[15], 
w[16], offset); + w[45] = amd_bytealign_S (w[14], w[15], offset); + w[44] = amd_bytealign_S (w[13], w[14], offset); + w[43] = amd_bytealign_S (w[12], w[13], offset); + w[42] = amd_bytealign_S (w[11], w[12], offset); + w[41] = amd_bytealign_S (w[10], w[11], offset); + w[40] = amd_bytealign_S (w[ 9], w[10], offset); + w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[37] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[30] = amd_bytealign_S ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; @@ -29314,80 +46223,42 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 31: - w[63] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[62] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[61] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[60] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[59] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[58] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[57] = amd_bytealign_S (w[26], w[25], 
offset_minus_4); - w[56] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[55] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[54] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[53] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[52] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[51] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[50] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[49] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[48] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[47] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[46] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[45] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[44] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[43] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[42] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[41] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[40] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[39] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[38] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[37] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[36] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[35] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[34] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[33] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[32] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[31] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[31], w[32], offset); + w[62] = amd_bytealign_S (w[30], w[31], offset); + w[61] = amd_bytealign_S (w[29], w[30], offset); + w[60] = amd_bytealign_S (w[28], w[29], offset); + w[59] = amd_bytealign_S (w[27], w[28], offset); + w[58] = amd_bytealign_S (w[26], w[27], offset); + w[57] = amd_bytealign_S (w[25], w[26], offset); + w[56] = amd_bytealign_S (w[24], w[25], offset); + w[55] = amd_bytealign_S (w[23], w[24], offset); + w[54] = amd_bytealign_S 
(w[22], w[23], offset); + w[53] = amd_bytealign_S (w[21], w[22], offset); + w[52] = amd_bytealign_S (w[20], w[21], offset); + w[51] = amd_bytealign_S (w[19], w[20], offset); + w[50] = amd_bytealign_S (w[18], w[19], offset); + w[49] = amd_bytealign_S (w[17], w[18], offset); + w[48] = amd_bytealign_S (w[16], w[17], offset); + w[47] = amd_bytealign_S (w[15], w[16], offset); + w[46] = amd_bytealign_S (w[14], w[15], offset); + w[45] = amd_bytealign_S (w[13], w[14], offset); + w[44] = amd_bytealign_S (w[12], w[13], offset); + w[43] = amd_bytealign_S (w[11], w[12], offset); + w[42] = amd_bytealign_S (w[10], w[11], offset); + w[41] = amd_bytealign_S (w[ 9], w[10], offset); + w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[36] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[31] = amd_bytealign_S ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; @@ -29420,78 +46291,41 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 32: - w[63] = amd_bytealign_S (w[31], w[30], offset_minus_4); - 
w[62] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[61] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[60] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[59] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[58] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[57] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[56] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[55] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[54] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[53] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[52] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[51] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[50] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[49] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[48] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[47] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[46] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[45] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[44] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[43] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[42] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[41] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[40] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[39] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[38] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[37] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[36] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[35] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[34] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[33] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[32] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[30], w[31], offset); + w[62] = amd_bytealign_S (w[29], w[30], offset); + w[61] = amd_bytealign_S (w[28], w[29], offset); + w[60] = amd_bytealign_S (w[27], w[28], offset); + w[59] 
= amd_bytealign_S (w[26], w[27], offset); + w[58] = amd_bytealign_S (w[25], w[26], offset); + w[57] = amd_bytealign_S (w[24], w[25], offset); + w[56] = amd_bytealign_S (w[23], w[24], offset); + w[55] = amd_bytealign_S (w[22], w[23], offset); + w[54] = amd_bytealign_S (w[21], w[22], offset); + w[53] = amd_bytealign_S (w[20], w[21], offset); + w[52] = amd_bytealign_S (w[19], w[20], offset); + w[51] = amd_bytealign_S (w[18], w[19], offset); + w[50] = amd_bytealign_S (w[17], w[18], offset); + w[49] = amd_bytealign_S (w[16], w[17], offset); + w[48] = amd_bytealign_S (w[15], w[16], offset); + w[47] = amd_bytealign_S (w[14], w[15], offset); + w[46] = amd_bytealign_S (w[13], w[14], offset); + w[45] = amd_bytealign_S (w[12], w[13], offset); + w[44] = amd_bytealign_S (w[11], w[12], offset); + w[43] = amd_bytealign_S (w[10], w[11], offset); + w[42] = amd_bytealign_S (w[ 9], w[10], offset); + w[41] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[35] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[32] = amd_bytealign_S ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; @@ -29525,76 +46359,40 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; 
- w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 33: - w[63] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[62] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[61] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[60] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[59] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[58] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[57] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[56] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[55] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[54] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[53] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[52] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[51] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[50] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[49] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[48] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[47] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[46] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[45] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[44] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[43] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[42] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[41] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[40] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[39] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[38] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[37] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[36] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[35] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[34] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[33] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[29], w[30], offset); + w[62] = 
amd_bytealign_S (w[28], w[29], offset); + w[61] = amd_bytealign_S (w[27], w[28], offset); + w[60] = amd_bytealign_S (w[26], w[27], offset); + w[59] = amd_bytealign_S (w[25], w[26], offset); + w[58] = amd_bytealign_S (w[24], w[25], offset); + w[57] = amd_bytealign_S (w[23], w[24], offset); + w[56] = amd_bytealign_S (w[22], w[23], offset); + w[55] = amd_bytealign_S (w[21], w[22], offset); + w[54] = amd_bytealign_S (w[20], w[21], offset); + w[53] = amd_bytealign_S (w[19], w[20], offset); + w[52] = amd_bytealign_S (w[18], w[19], offset); + w[51] = amd_bytealign_S (w[17], w[18], offset); + w[50] = amd_bytealign_S (w[16], w[17], offset); + w[49] = amd_bytealign_S (w[15], w[16], offset); + w[48] = amd_bytealign_S (w[14], w[15], offset); + w[47] = amd_bytealign_S (w[13], w[14], offset); + w[46] = amd_bytealign_S (w[12], w[13], offset); + w[45] = amd_bytealign_S (w[11], w[12], offset); + w[44] = amd_bytealign_S (w[10], w[11], offset); + w[43] = amd_bytealign_S (w[ 9], w[10], offset); + w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[33] = amd_bytealign_S ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; @@ -29629,74 +46427,39 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - 
w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 34: - w[63] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[62] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[61] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[60] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[59] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[58] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[57] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[56] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[55] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[54] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[53] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[52] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[51] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[50] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[49] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[48] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[47] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[46] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[45] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[44] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[43] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[42] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[41] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[40] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[39] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[38] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[37] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[36] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[35] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[34] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[28], w[29], 
offset); + w[62] = amd_bytealign_S (w[27], w[28], offset); + w[61] = amd_bytealign_S (w[26], w[27], offset); + w[60] = amd_bytealign_S (w[25], w[26], offset); + w[59] = amd_bytealign_S (w[24], w[25], offset); + w[58] = amd_bytealign_S (w[23], w[24], offset); + w[57] = amd_bytealign_S (w[22], w[23], offset); + w[56] = amd_bytealign_S (w[21], w[22], offset); + w[55] = amd_bytealign_S (w[20], w[21], offset); + w[54] = amd_bytealign_S (w[19], w[20], offset); + w[53] = amd_bytealign_S (w[18], w[19], offset); + w[52] = amd_bytealign_S (w[17], w[18], offset); + w[51] = amd_bytealign_S (w[16], w[17], offset); + w[50] = amd_bytealign_S (w[15], w[16], offset); + w[49] = amd_bytealign_S (w[14], w[15], offset); + w[48] = amd_bytealign_S (w[13], w[14], offset); + w[47] = amd_bytealign_S (w[12], w[13], offset); + w[46] = amd_bytealign_S (w[11], w[12], offset); + w[45] = amd_bytealign_S (w[10], w[11], offset); + w[44] = amd_bytealign_S (w[ 9], w[10], offset); + w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[34] = amd_bytealign_S ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; @@ -29732,72 +46495,38 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; 
- w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 35: - w[63] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[62] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[61] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[60] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[59] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[58] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[57] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[56] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[55] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[54] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[53] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[52] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[51] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[50] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[49] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[48] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[47] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[46] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[45] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[44] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[43] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[42] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[41] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[40] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[39] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[38] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[37] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[36] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[35] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[27], w[28], offset); + w[62] = amd_bytealign_S (w[26], w[27], offset); + w[61] = amd_bytealign_S (w[25], w[26], offset); 
+ w[60] = amd_bytealign_S (w[24], w[25], offset); + w[59] = amd_bytealign_S (w[23], w[24], offset); + w[58] = amd_bytealign_S (w[22], w[23], offset); + w[57] = amd_bytealign_S (w[21], w[22], offset); + w[56] = amd_bytealign_S (w[20], w[21], offset); + w[55] = amd_bytealign_S (w[19], w[20], offset); + w[54] = amd_bytealign_S (w[18], w[19], offset); + w[53] = amd_bytealign_S (w[17], w[18], offset); + w[52] = amd_bytealign_S (w[16], w[17], offset); + w[51] = amd_bytealign_S (w[15], w[16], offset); + w[50] = amd_bytealign_S (w[14], w[15], offset); + w[49] = amd_bytealign_S (w[13], w[14], offset); + w[48] = amd_bytealign_S (w[12], w[13], offset); + w[47] = amd_bytealign_S (w[11], w[12], offset); + w[46] = amd_bytealign_S (w[10], w[11], offset); + w[45] = amd_bytealign_S (w[ 9], w[10], offset); + w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[42] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[35] = amd_bytealign_S ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; @@ -29834,70 +46563,37 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 36: - 
w[63] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[62] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[61] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[60] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[59] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[58] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[57] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[56] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[55] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[54] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[53] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[52] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[51] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[50] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[49] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[48] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[47] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[46] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[45] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[44] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[43] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[42] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[41] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[40] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[39] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[38] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[37] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[36] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[26], w[27], offset); + w[62] = amd_bytealign_S (w[25], w[26], offset); + w[61] = amd_bytealign_S (w[24], w[25], offset); + w[60] = amd_bytealign_S (w[23], w[24], offset); + w[59] = amd_bytealign_S (w[22], w[23], offset); + w[58] = amd_bytealign_S (w[21], w[22], offset); + w[57] = amd_bytealign_S (w[20], w[21], offset); + w[56] = amd_bytealign_S 
(w[19], w[20], offset); + w[55] = amd_bytealign_S (w[18], w[19], offset); + w[54] = amd_bytealign_S (w[17], w[18], offset); + w[53] = amd_bytealign_S (w[16], w[17], offset); + w[52] = amd_bytealign_S (w[15], w[16], offset); + w[51] = amd_bytealign_S (w[14], w[15], offset); + w[50] = amd_bytealign_S (w[13], w[14], offset); + w[49] = amd_bytealign_S (w[12], w[13], offset); + w[48] = amd_bytealign_S (w[11], w[12], offset); + w[47] = amd_bytealign_S (w[10], w[11], offset); + w[46] = amd_bytealign_S (w[ 9], w[10], offset); + w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[38] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[36] = amd_bytealign_S ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; @@ -29935,68 +46631,36 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 37: - w[63] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[62] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[61] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[60] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[59] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[58] 
= amd_bytealign_S (w[21], w[20], offset_minus_4); - w[57] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[56] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[55] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[54] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[53] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[52] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[51] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[50] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[49] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[48] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[47] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[46] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[45] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[44] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[43] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[42] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[41] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[40] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[39] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[38] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[37] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[25], w[26], offset); + w[62] = amd_bytealign_S (w[24], w[25], offset); + w[61] = amd_bytealign_S (w[23], w[24], offset); + w[60] = amd_bytealign_S (w[22], w[23], offset); + w[59] = amd_bytealign_S (w[21], w[22], offset); + w[58] = amd_bytealign_S (w[20], w[21], offset); + w[57] = amd_bytealign_S (w[19], w[20], offset); + w[56] = amd_bytealign_S (w[18], w[19], offset); + w[55] = amd_bytealign_S (w[17], w[18], offset); + w[54] = amd_bytealign_S (w[16], w[17], offset); + w[53] = amd_bytealign_S (w[15], w[16], offset); + w[52] = amd_bytealign_S (w[14], w[15], offset); + w[51] = amd_bytealign_S (w[13], w[14], offset); + w[50] = amd_bytealign_S (w[12], w[13], offset); + w[49] = amd_bytealign_S (w[11], 
w[12], offset); + w[48] = amd_bytealign_S (w[10], w[11], offset); + w[47] = amd_bytealign_S (w[ 9], w[10], offset); + w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[45] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[37] = amd_bytealign_S ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; @@ -30035,66 +46699,35 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 38: - w[63] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[62] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[61] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[60] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[59] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[58] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[57] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[56] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[55] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[54] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[53] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[52] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[51] = amd_bytealign_S 
(w[13], w[12], offset_minus_4); - w[50] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[49] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[48] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[47] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[46] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[45] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[44] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[43] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[42] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[41] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[40] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[39] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[38] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[24], w[25], offset); + w[62] = amd_bytealign_S (w[23], w[24], offset); + w[61] = amd_bytealign_S (w[22], w[23], offset); + w[60] = amd_bytealign_S (w[21], w[22], offset); + w[59] = amd_bytealign_S (w[20], w[21], offset); + w[58] = amd_bytealign_S (w[19], w[20], offset); + w[57] = amd_bytealign_S (w[18], w[19], offset); + w[56] = amd_bytealign_S (w[17], w[18], offset); + w[55] = amd_bytealign_S (w[16], w[17], offset); + w[54] = amd_bytealign_S (w[15], w[16], offset); + w[53] = amd_bytealign_S (w[14], w[15], offset); + w[52] = amd_bytealign_S (w[13], w[14], offset); + w[51] = amd_bytealign_S (w[12], w[13], offset); + w[50] = amd_bytealign_S (w[11], w[12], offset); + w[49] = amd_bytealign_S (w[10], w[11], offset); + w[48] = amd_bytealign_S (w[ 9], w[10], offset); + w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[39] = 
amd_bytealign_S (w[ 0], w[ 1], offset); + w[38] = amd_bytealign_S ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; @@ -30134,64 +46767,34 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 39: - w[63] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[62] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[61] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[60] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[59] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[58] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[57] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[56] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[55] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[54] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[53] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[52] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[51] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[50] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[49] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[48] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[47] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[46] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[45] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[44] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[43] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[42] = amd_bytealign_S (w[ 3], w[ 2], 
offset_minus_4); - w[41] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[40] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[39] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[23], w[24], offset); + w[62] = amd_bytealign_S (w[22], w[23], offset); + w[61] = amd_bytealign_S (w[21], w[22], offset); + w[60] = amd_bytealign_S (w[20], w[21], offset); + w[59] = amd_bytealign_S (w[19], w[20], offset); + w[58] = amd_bytealign_S (w[18], w[19], offset); + w[57] = amd_bytealign_S (w[17], w[18], offset); + w[56] = amd_bytealign_S (w[16], w[17], offset); + w[55] = amd_bytealign_S (w[15], w[16], offset); + w[54] = amd_bytealign_S (w[14], w[15], offset); + w[53] = amd_bytealign_S (w[13], w[14], offset); + w[52] = amd_bytealign_S (w[12], w[13], offset); + w[51] = amd_bytealign_S (w[11], w[12], offset); + w[50] = amd_bytealign_S (w[10], w[11], offset); + w[49] = amd_bytealign_S (w[ 9], w[10], offset); + w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[47] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[39] = amd_bytealign_S ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; @@ -30232,62 +46835,33 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - 
w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 40: - w[63] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[62] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[61] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[60] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[59] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[58] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[57] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[56] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[55] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[54] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[53] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[52] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[51] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[50] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[49] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[48] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[47] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[46] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[45] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[44] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[43] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[42] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[41] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[40] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[22], w[23], offset); + w[62] = amd_bytealign_S (w[21], w[22], offset); + w[61] = amd_bytealign_S (w[20], w[21], offset); + w[60] = amd_bytealign_S (w[19], w[20], offset); + w[59] = amd_bytealign_S (w[18], w[19], offset); + w[58] = amd_bytealign_S (w[17], w[18], offset); + w[57] = amd_bytealign_S (w[16], w[17], offset); + w[56] = amd_bytealign_S (w[15], w[16], offset); + w[55] = amd_bytealign_S (w[14], w[15], offset); + w[54] = amd_bytealign_S (w[13], w[14], offset); + w[53] = amd_bytealign_S (w[12], w[13], 
offset); + w[52] = amd_bytealign_S (w[11], w[12], offset); + w[51] = amd_bytealign_S (w[10], w[11], offset); + w[50] = amd_bytealign_S (w[ 9], w[10], offset); + w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[48] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[40] = amd_bytealign_S ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; @@ -30329,60 +46903,32 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 41: - w[63] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[62] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[61] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[60] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[59] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[58] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[57] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[56] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[55] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[54] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[53] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[52] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[51] = amd_bytealign_S (w[10], 
w[ 9], offset_minus_4); - w[50] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[49] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[48] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[47] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[46] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[45] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[44] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[43] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[42] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[41] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[21], w[22], offset); + w[62] = amd_bytealign_S (w[20], w[21], offset); + w[61] = amd_bytealign_S (w[19], w[20], offset); + w[60] = amd_bytealign_S (w[18], w[19], offset); + w[59] = amd_bytealign_S (w[17], w[18], offset); + w[58] = amd_bytealign_S (w[16], w[17], offset); + w[57] = amd_bytealign_S (w[15], w[16], offset); + w[56] = amd_bytealign_S (w[14], w[15], offset); + w[55] = amd_bytealign_S (w[13], w[14], offset); + w[54] = amd_bytealign_S (w[12], w[13], offset); + w[53] = amd_bytealign_S (w[11], w[12], offset); + w[52] = amd_bytealign_S (w[10], w[11], offset); + w[51] = amd_bytealign_S (w[ 9], w[10], offset); + w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[43] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[41] = amd_bytealign_S ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; @@ -30425,58 +46971,31 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = 
w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 42: - w[63] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[62] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[61] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[60] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[59] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[58] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[57] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[56] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[55] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[54] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[53] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[52] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[51] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[50] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[49] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[48] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[47] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[46] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[45] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[44] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[43] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[42] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[20], w[21], offset); + w[62] = amd_bytealign_S (w[19], w[20], offset); + w[61] = amd_bytealign_S (w[18], w[19], offset); + w[60] = amd_bytealign_S (w[17], w[18], offset); + w[59] = amd_bytealign_S (w[16], w[17], offset); + w[58] = amd_bytealign_S (w[15], w[16], offset); + w[57] = amd_bytealign_S (w[14], w[15], offset); + w[56] = amd_bytealign_S 
(w[13], w[14], offset); + w[55] = amd_bytealign_S (w[12], w[13], offset); + w[54] = amd_bytealign_S (w[11], w[12], offset); + w[53] = amd_bytealign_S (w[10], w[11], offset); + w[52] = amd_bytealign_S (w[ 9], w[10], offset); + w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[49] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[42] = amd_bytealign_S ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; @@ -30520,56 +47039,30 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 43: - w[63] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[62] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[61] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[60] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[59] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[58] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[57] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[56] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[55] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[54] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[53] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[52] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - 
w[51] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[50] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[49] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[48] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[47] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[46] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[45] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[44] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[43] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[19], w[20], offset); + w[62] = amd_bytealign_S (w[18], w[19], offset); + w[61] = amd_bytealign_S (w[17], w[18], offset); + w[60] = amd_bytealign_S (w[16], w[17], offset); + w[59] = amd_bytealign_S (w[15], w[16], offset); + w[58] = amd_bytealign_S (w[14], w[15], offset); + w[57] = amd_bytealign_S (w[13], w[14], offset); + w[56] = amd_bytealign_S (w[12], w[13], offset); + w[55] = amd_bytealign_S (w[11], w[12], offset); + w[54] = amd_bytealign_S (w[10], w[11], offset); + w[53] = amd_bytealign_S (w[ 9], w[10], offset); + w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[43] = amd_bytealign_S ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; @@ -30614,54 +47107,29 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = 
w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 44: - w[63] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[62] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[61] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[60] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[59] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[58] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[57] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[56] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[55] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[54] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[53] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[52] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[51] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[50] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[49] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[48] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[47] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[46] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[45] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[44] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[18], w[19], offset); + w[62] = amd_bytealign_S (w[17], w[18], offset); + w[61] = amd_bytealign_S (w[16], w[17], offset); + w[60] = amd_bytealign_S (w[15], w[16], offset); + w[59] = amd_bytealign_S (w[14], w[15], offset); + w[58] = amd_bytealign_S (w[13], w[14], offset); + w[57] = amd_bytealign_S (w[12], w[13], offset); + w[56] = amd_bytealign_S (w[11], w[12], offset); + w[55] = amd_bytealign_S (w[10], w[11], offset); + w[54] = amd_bytealign_S (w[ 9], w[10], offset); + w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[49] = 
amd_bytealign_S (w[ 4], w[ 5], offset); + w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[44] = amd_bytealign_S ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; @@ -30707,52 +47175,28 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 45: - w[63] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[62] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[61] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[60] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[59] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[58] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[57] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[56] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[55] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[54] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[53] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[52] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[51] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[50] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[49] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[48] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[47] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[46] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[45] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[17], w[18], offset); + w[62] = 
amd_bytealign_S (w[16], w[17], offset); + w[61] = amd_bytealign_S (w[15], w[16], offset); + w[60] = amd_bytealign_S (w[14], w[15], offset); + w[59] = amd_bytealign_S (w[13], w[14], offset); + w[58] = amd_bytealign_S (w[12], w[13], offset); + w[57] = amd_bytealign_S (w[11], w[12], offset); + w[56] = amd_bytealign_S (w[10], w[11], offset); + w[55] = amd_bytealign_S (w[ 9], w[10], offset); + w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[45] = amd_bytealign_S ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; @@ -30799,50 +47243,27 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 46: - w[63] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[62] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[61] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[60] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[59] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[58] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[57] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[56] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[55] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[54] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - 
w[53] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[52] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[51] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[50] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[49] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[48] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[47] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[46] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[16], w[17], offset); + w[62] = amd_bytealign_S (w[15], w[16], offset); + w[61] = amd_bytealign_S (w[14], w[15], offset); + w[60] = amd_bytealign_S (w[13], w[14], offset); + w[59] = amd_bytealign_S (w[12], w[13], offset); + w[58] = amd_bytealign_S (w[11], w[12], offset); + w[57] = amd_bytealign_S (w[10], w[11], offset); + w[56] = amd_bytealign_S (w[ 9], w[10], offset); + w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[46] = amd_bytealign_S ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; @@ -30890,48 +47311,26 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 47: - w[63] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[62] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[61] = 
amd_bytealign_S (w[14], w[13], offset_minus_4); - w[60] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[59] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[58] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[57] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[56] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[55] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[54] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[53] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[52] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[51] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[50] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[49] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[48] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[47] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[15], w[16], offset); + w[62] = amd_bytealign_S (w[14], w[15], offset); + w[61] = amd_bytealign_S (w[13], w[14], offset); + w[60] = amd_bytealign_S (w[12], w[13], offset); + w[59] = amd_bytealign_S (w[11], w[12], offset); + w[58] = amd_bytealign_S (w[10], w[11], offset); + w[57] = amd_bytealign_S (w[ 9], w[10], offset); + w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[47] = amd_bytealign_S ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; @@ -30980,46 +47379,25 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] 
= w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 48: - w[63] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[62] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[61] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[60] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[59] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[58] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[57] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[56] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[55] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[54] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[53] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[52] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[51] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[50] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[49] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[48] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[14], w[15], offset); + w[62] = amd_bytealign_S (w[13], w[14], offset); + w[61] = amd_bytealign_S (w[12], w[13], offset); + w[60] = amd_bytealign_S (w[11], w[12], offset); + w[59] = amd_bytealign_S (w[10], w[11], offset); + w[58] = amd_bytealign_S (w[ 9], w[10], offset); + w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[48] = amd_bytealign_S ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; @@ -31069,44 +47447,24 @@ inline void 
switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 49: - w[63] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[62] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[61] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[60] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[59] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[58] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[57] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[56] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[55] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[54] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[53] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[52] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[51] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[50] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[49] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[13], w[14], offset); + w[62] = amd_bytealign_S (w[12], w[13], offset); + w[61] = amd_bytealign_S (w[11], w[12], offset); + w[60] = amd_bytealign_S (w[10], w[11], offset); + w[59] = amd_bytealign_S (w[ 9], w[10], offset); + w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[49] = amd_bytealign_S ( 
0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; @@ -31157,42 +47515,23 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 50: - w[63] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[62] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[61] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[60] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[59] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[58] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[57] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[56] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[55] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[54] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[53] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[52] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[51] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[50] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[12], w[13], offset); + w[62] = amd_bytealign_S (w[11], w[12], offset); + w[61] = amd_bytealign_S (w[10], w[11], offset); + w[60] = amd_bytealign_S (w[ 9], w[10], offset); + w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[50] = amd_bytealign_S ( 0, w[ 0], offset); w[49] = 0; w[48] 
= 0; w[47] = 0; @@ -31244,40 +47583,22 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 51: - w[63] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[62] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[61] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[60] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[59] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[58] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[57] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[56] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[55] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[54] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[53] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[52] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[51] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[11], w[12], offset); + w[62] = amd_bytealign_S (w[10], w[11], offset); + w[61] = amd_bytealign_S (w[ 9], w[10], offset); + w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[59] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[51] = amd_bytealign_S ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; @@ -31330,38 +47651,21 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 
== 0) - { - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 52: - w[63] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[62] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[61] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[60] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[59] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[58] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[57] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[56] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[55] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[54] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[53] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[52] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[10], w[11], offset); + w[62] = amd_bytealign_S (w[ 9], w[10], offset); + w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[58] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[57] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[52] = amd_bytealign_S ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; @@ -31415,36 +47719,20 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 53: - w[63] = amd_bytealign_S (w[10], w[ 9], 
offset_minus_4); - w[62] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[61] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[60] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[59] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[58] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[57] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[56] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[55] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[54] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[53] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 9], w[10], offset); + w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[54] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[53] = amd_bytealign_S ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; @@ -31499,34 +47787,19 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 54: - w[63] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[62] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[61] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[60] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[59] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[58] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[57] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[56] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - 
w[55] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[54] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[54] = amd_bytealign_S ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; @@ -31582,32 +47855,18 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 55: - w[63] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[62] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[61] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[60] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[59] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[58] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[57] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[56] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[55] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[55] = amd_bytealign_S ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; @@ -31664,30 
+47923,17 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 56: - w[63] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[62] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[61] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[60] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[59] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[58] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[57] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[56] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[57] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[56] = amd_bytealign_S ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; @@ -31745,28 +47991,16 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 57: - w[63] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[62] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[61] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[60] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[59] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[58] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[57] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[62] = amd_bytealign_S (w[ 4], w[ 
5], offset); + w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[60] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[57] = amd_bytealign_S ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; @@ -31825,26 +48059,15 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 58: - w[63] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[62] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[61] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[60] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[59] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[58] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[60] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[58] = amd_bytealign_S ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; @@ -31904,24 +48127,14 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 59: - w[63] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[62] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[61] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[60] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[59] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[60] = amd_bytealign_S (w[ 0], 
w[ 1], offset); + w[59] = amd_bytealign_S ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; @@ -31982,22 +48195,13 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 60: - w[63] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[62] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[61] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[60] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[60] = amd_bytealign_S ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; @@ -32059,20 +48263,12 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 61: - w[63] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[62] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[61] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[61] = amd_bytealign_S ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; @@ -32135,18 +48331,11 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 62: - w[63] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[62] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[62] = amd_bytealign_S ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; @@ -32210,16 +48399,10 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if 
(offset_mod_4 == 0) - { - w[62] = w[63]; - w[63] = 0; - } - break; case 63: - w[63] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; @@ -32284,18 +48467,15 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[63] = 0; - } - break; } + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); + #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -36655,7 +52835,7 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) #endif } -inline void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) +void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -45438,7 +61618,7 @@ inline void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) PACKSV4 (s6, v6, e); \ PACKSV4 (s7, v7, e); -inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) +void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45498,7 +61678,7 @@ inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) +void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45678,7 +61858,7 @@ inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4 #endif } -inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) +void append_0x01_2x4_VV (u32x 
w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45736,7 +61916,7 @@ inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) #endif } -inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) +void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45794,7 +61974,7 @@ inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) #endif } -inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) +void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 diff --git a/OpenCL/inc_hash_md4.cl b/OpenCL/inc_hash_md4.cl index 668d0bbc9..5bc5d5978 100644 --- a/OpenCL/inc_hash_md4.cl +++ b/OpenCL/inc_hash_md4.cl @@ -111,7 +111,7 @@ void md4_init (md4_ctx_t *ctx) void md4_update_64 (md4_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1047,6 +1047,7 @@ void md4_hmac_update_utf16le_swap (md4_hmac_ctx_t *ctx, const u32 *w, const int { md4_update_utf16le_swap (&ctx->ipad, w, len); } + void md4_hmac_update_global (md4_hmac_ctx_t *ctx, const __global u32 *w, const int len) { md4_update_global (&ctx->ipad, w, len); @@ -1234,7 +1235,7 @@ void md4_init_vector_from_scalar (md4_ctx_vector_t *ctx, md4_ctx_t *ctx0) void md4_update_vector_64 (md4_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_md5.cl b/OpenCL/inc_hash_md5.cl index 926bbb2c4..95e06cbef 100644 --- a/OpenCL/inc_hash_md5.cl +++ b/OpenCL/inc_hash_md5.cl @@ -145,7 +145,7 @@ void md5_init (md5_ctx_t *ctx) void md5_update_64 (md5_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 
w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1303,7 +1303,7 @@ void md5_init_vector_from_scalar (md5_ctx_vector_t *ctx, md5_ctx_t *ctx0) void md5_update_vector_64 (md5_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_ripemd160.cl b/OpenCL/inc_hash_ripemd160.cl index bf5d2ec42..709ad3eb2 100644 --- a/OpenCL/inc_hash_ripemd160.cl +++ b/OpenCL/inc_hash_ripemd160.cl @@ -245,7 +245,7 @@ void ripemd160_init (ripemd160_ctx_t *ctx) void ripemd160_update_64 (ripemd160_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1504,7 +1504,7 @@ void ripemd160_init_vector_from_scalar (ripemd160_ctx_vector_t *ctx, ripemd160_c void ripemd160_update_vector_64 (ripemd160_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha1.cl b/OpenCL/inc_hash_sha1.cl index 9713a02dd..47fe4691d 100644 --- a/OpenCL/inc_hash_sha1.cl +++ b/OpenCL/inc_hash_sha1.cl @@ -177,7 +177,7 @@ void sha1_init (sha1_ctx_t *ctx) void sha1_update_64 (sha1_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1368,7 +1368,7 @@ void sha1_init_vector_from_scalar (sha1_ctx_vector_t *ctx, sha1_ctx_t *ctx0) void sha1_update_vector_64 (sha1_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int 
len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha224.cl b/OpenCL/inc_hash_sha224.cl index 4f35938a6..553397f6c 100644 --- a/OpenCL/inc_hash_sha224.cl +++ b/OpenCL/inc_hash_sha224.cl @@ -162,7 +162,7 @@ void sha224_init (sha224_ctx_t *ctx) void sha224_update_64 (sha224_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1321,7 +1321,7 @@ void sha224_init_vector_from_scalar (sha224_ctx_vector_t *ctx, sha224_ctx_t *ctx void sha224_update_vector_64 (sha224_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha256.cl b/OpenCL/inc_hash_sha256.cl index 75fd99acf..92b35b579 100644 --- a/OpenCL/inc_hash_sha256.cl +++ b/OpenCL/inc_hash_sha256.cl @@ -162,7 +162,7 @@ void sha256_init (sha256_ctx_t *ctx) void sha256_update_64 (sha256_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1321,7 +1321,7 @@ void sha256_init_vector_from_scalar (sha256_ctx_vector_t *ctx, sha256_ctx_t *ctx void sha256_update_vector_64 (sha256_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha384.cl b/OpenCL/inc_hash_sha384.cl index 8302cd379..0800b253a 100644 --- a/OpenCL/inc_hash_sha384.cl +++ b/OpenCL/inc_hash_sha384.cl @@ -186,7 +186,7 @@ void sha384_init (sha384_ctx_t *ctx) void 
sha384_update_128 (sha384_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif @@ -2017,7 +2017,7 @@ void sha384_init_vector_from_scalar (sha384_ctx_vector_t *ctx, sha384_ctx_t *ctx void sha384_update_vector_128 (sha384_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif diff --git a/OpenCL/inc_hash_sha512.cl b/OpenCL/inc_hash_sha512.cl index 6c58834eb..61c6e143d 100644 --- a/OpenCL/inc_hash_sha512.cl +++ b/OpenCL/inc_hash_sha512.cl @@ -186,7 +186,7 @@ void sha512_init (sha512_ctx_t *ctx) void sha512_update_128 (sha512_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif @@ -2017,7 +2017,7 @@ void sha512_init_vector_from_scalar (sha512_ctx_vector_t *ctx, sha512_ctx_t *ctx void sha512_update_vector_128 (sha512_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif diff --git a/OpenCL/inc_hash_whirlpool.cl b/OpenCL/inc_hash_whirlpool.cl index a983cefb7..1ec270105 100644 --- a/OpenCL/inc_hash_whirlpool.cl +++ b/OpenCL/inc_hash_whirlpool.cl @@ -1345,7 +1345,7 @@ void whirlpool_init (whirlpool_ctx_t *ctx, __local u32 (*s_Ch)[256], __local u32 void whirlpool_update_64 (whirlpool_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef 
IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -2608,7 +2608,7 @@ void whirlpool_init_vector_from_scalar (whirlpool_ctx_vector_t *ctx, whirlpool_c void whirlpool_update_vector_64 (whirlpool_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index 71c926d92..74ca84cc2 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -inline u32 generate_cmask (const u32 value) +u32 generate_cmask (const u32 value) { const u32 rmask = ((value & 0x40404040u) >> 1u) & ~((value & 0x80808080u) >> 2u); @@ -14,7 +14,7 @@ inline u32 generate_cmask (const u32 value) return rmask & ~hmask & lmask; } -inline void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) +void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) { const u32 tmp = (1u << ((offset & 3u) * 8u)) - 1u; @@ -67,7 +67,7 @@ inline void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) } } -inline void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) +void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) { const u32 tmp = ~((1u << ((offset & 3u) * 8u)) - 1u); @@ -120,7 +120,7 @@ inline void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) } } -inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) +void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { out0[0] = amd_bytealign_S (in0[1], in0[0], 1); out0[1] = amd_bytealign_S (in0[2], in0[1], 1); @@ -132,7 +132,7 @@ inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 o out1[3] = amd_bytealign_S ( 0, in1[3], 1); } -inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) 
+void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { out1[3] = amd_bytealign_S (in1[3], in1[2], 3); out1[2] = amd_bytealign_S (in1[2], in1[1], 3); @@ -144,7 +144,7 @@ inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 o out0[0] = amd_bytealign_S (in0[0], 0, 3); } -inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) +void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) { switch (num) { @@ -439,7 +439,7 @@ inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 } } -inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) +void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) { switch (num) { @@ -734,7 +734,7 @@ inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 } } -inline void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0) +void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0) { // this version works with 1 byte append only @@ -754,12 +754,11 @@ inline void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 buf1[3] |= (offset >= 28) ? 
tmp : 0; } -inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) +void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; u32 s0 = 0; u32 s1 = 0; @@ -769,64 +768,69 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 u32 s5 = 0; u32 s6 = 0; u32 s7 = 0; - u32 s8 = 0; + + #if defined IS_AMD || defined IS_GENERIC + const u32 src_r00 = swap32_S (src_r0[0]); + const u32 src_r01 = swap32_S (src_r0[1]); + const u32 src_r02 = swap32_S (src_r0[2]); + const u32 src_r03 = swap32_S (src_r0[3]); + const u32 src_r10 = swap32_S (src_r1[0]); + const u32 src_r11 = swap32_S (src_r1[1]); + const u32 src_r12 = swap32_S (src_r1[2]); + const u32 src_r13 = swap32_S (src_r1[3]); switch (offset / 4) { case 0: - s8 = amd_bytealign_S ( 0, src_r1[3], offset_minus_4); - s7 = amd_bytealign_S (src_r1[3], src_r1[2], offset_minus_4); - s6 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4); - s5 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s4 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s3 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s2 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s1 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s0 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r12, src_r13, offset); + s6 = amd_bytealign_S (src_r11, src_r12, offset); + s5 = amd_bytealign_S (src_r10, src_r11, offset); + s4 = amd_bytealign_S (src_r03, src_r10, offset); + s3 = amd_bytealign_S (src_r02, src_r03, offset); + s2 = amd_bytealign_S (src_r01, src_r02, offset); + s1 = amd_bytealign_S (src_r00, src_r01, offset); 
+ s0 = amd_bytealign_S ( 0, src_r00, offset); break; case 1: - s8 = amd_bytealign_S ( 0, src_r1[2], offset_minus_4); - s7 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4); - s6 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s5 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s4 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s3 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s2 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s1 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r11, src_r12, offset); + s6 = amd_bytealign_S (src_r10, src_r11, offset); + s5 = amd_bytealign_S (src_r03, src_r10, offset); + s4 = amd_bytealign_S (src_r02, src_r03, offset); + s3 = amd_bytealign_S (src_r01, src_r02, offset); + s2 = amd_bytealign_S (src_r00, src_r01, offset); + s1 = amd_bytealign_S ( 0, src_r00, offset); s0 = 0; break; case 2: - s8 = amd_bytealign_S ( 0, src_r1[1], offset_minus_4); - s7 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s6 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s5 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s4 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s3 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s2 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r10, src_r11, offset); + s6 = amd_bytealign_S (src_r03, src_r10, offset); + s5 = amd_bytealign_S (src_r02, src_r03, offset); + s4 = amd_bytealign_S (src_r01, src_r02, offset); + s3 = amd_bytealign_S (src_r00, src_r01, offset); + s2 = amd_bytealign_S ( 0, src_r00, offset); s1 = 0; s0 = 0; break; case 3: - s8 = amd_bytealign_S ( 0, src_r1[0], offset_minus_4); - s7 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s6 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s5 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s4 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s3 = 
amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r03, src_r10, offset); + s6 = amd_bytealign_S (src_r02, src_r03, offset); + s5 = amd_bytealign_S (src_r01, src_r02, offset); + s4 = amd_bytealign_S (src_r00, src_r01, offset); + s3 = amd_bytealign_S ( 0, src_r00, offset); s2 = 0; s1 = 0; s0 = 0; + break; case 4: - s8 = amd_bytealign_S ( 0, src_r0[3], offset_minus_4); - s7 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s6 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s5 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s4 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r02, src_r03, offset); + s6 = amd_bytealign_S (src_r01, src_r02, offset); + s5 = amd_bytealign_S (src_r00, src_r01, offset); + s4 = amd_bytealign_S ( 0, src_r00, offset); s3 = 0; s2 = 0; s1 = 0; @@ -834,10 +838,9 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 5: - s8 = amd_bytealign_S ( 0, src_r0[2], offset_minus_4); - s7 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s6 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s5 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r01, src_r02, offset); + s6 = amd_bytealign_S (src_r00, src_r01, offset); + s5 = amd_bytealign_S ( 0, src_r00, offset); s4 = 0; s3 = 0; s2 = 0; @@ -846,9 +849,8 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 6: - s8 = amd_bytealign_S ( 0, src_r0[1], offset_minus_4); - s7 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s6 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r00, src_r01, offset); + s6 = amd_bytealign_S ( 0, src_r00, offset); s5 = 0; s4 = 0; s3 = 0; @@ -858,8 +860,7 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 7: - s8 = amd_bytealign_S ( 0, src_r0[0], offset_minus_4); - s7 = amd_bytealign_S 
(src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S ( 0, src_r00, offset); s6 = 0; s5 = 0; s4 = 0; @@ -870,85 +871,69 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; } - if (offset_mod_4 == 0) - { - buf0[0] = src_l0[0] | s1; - buf0[1] = src_l0[1] | s2; - buf0[2] = src_l0[2] | s3; - buf0[3] = src_l0[3] | s4; - buf1[0] = src_l1[0] | s5; - buf1[1] = src_l1[1] | s6; - buf1[2] = src_l1[2] | s7; - buf1[3] = src_l1[3] | s8; - } - else - { - buf0[0] = src_l0[0] | s0; - buf0[1] = src_l0[1] | s1; - buf0[2] = src_l0[2] | s2; - buf0[3] = src_l0[3] | s3; - buf1[0] = src_l1[0] | s4; - buf1[1] = src_l1[1] | s5; - buf1[2] = src_l1[2] | s6; - buf1[3] = src_l1[3] | s7; - } + s0 = swap32_S (s0); + s1 = swap32_S (s1); + s2 = swap32_S (s2); + s3 = swap32_S (s3); + s4 = swap32_S (s4); + s5 = swap32_S (s5); + s6 = swap32_S (s6); + s7 = swap32_S (s7); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - u32 s0 = 0; - u32 s1 = 0; - u32 s2 = 0; - u32 s3 = 0; - u32 s4 = 0; - u32 s5 = 0; - u32 s6 = 0; - u32 s7 = 0; + const u32 src_r00 = src_r0[0]; + const u32 src_r01 = src_r0[1]; + const u32 src_r02 = src_r0[2]; + const u32 src_r03 = src_r0[3]; + const u32 src_r10 = src_r1[0]; + const u32 src_r11 = src_r1[1]; + const u32 src_r12 = src_r1[2]; + const u32 src_r13 = src_r1[3]; switch (offset / 4) { case 0: - s7 = __byte_perm_S (src_r1[2], src_r1[3], selector); - s6 = __byte_perm_S (src_r1[1], src_r1[2], selector); - s5 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s4 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s3 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s2 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s1 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s0 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r12, src_r13, selector); + s6 = __byte_perm_S (src_r11, src_r12, selector); + s5 = __byte_perm_S (src_r10, 
src_r11, selector); + s4 = __byte_perm_S (src_r03, src_r10, selector); + s3 = __byte_perm_S (src_r02, src_r03, selector); + s2 = __byte_perm_S (src_r01, src_r02, selector); + s1 = __byte_perm_S (src_r00, src_r01, selector); + s0 = __byte_perm_S ( 0, src_r00, selector); break; case 1: - s7 = __byte_perm_S (src_r1[1], src_r1[2], selector); - s6 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s5 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s4 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s3 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s2 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s1 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r11, src_r12, selector); + s6 = __byte_perm_S (src_r10, src_r11, selector); + s5 = __byte_perm_S (src_r03, src_r10, selector); + s4 = __byte_perm_S (src_r02, src_r03, selector); + s3 = __byte_perm_S (src_r01, src_r02, selector); + s2 = __byte_perm_S (src_r00, src_r01, selector); + s1 = __byte_perm_S ( 0, src_r00, selector); s0 = 0; break; case 2: - s7 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s6 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s5 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s4 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s3 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s2 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r10, src_r11, selector); + s6 = __byte_perm_S (src_r03, src_r10, selector); + s5 = __byte_perm_S (src_r02, src_r03, selector); + s4 = __byte_perm_S (src_r01, src_r02, selector); + s3 = __byte_perm_S (src_r00, src_r01, selector); + s2 = __byte_perm_S ( 0, src_r00, selector); s1 = 0; s0 = 0; break; case 3: - s7 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s6 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s5 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s4 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s3 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r03, 
src_r10, selector); + s6 = __byte_perm_S (src_r02, src_r03, selector); + s5 = __byte_perm_S (src_r01, src_r02, selector); + s4 = __byte_perm_S (src_r00, src_r01, selector); + s3 = __byte_perm_S ( 0, src_r00, selector); s2 = 0; s1 = 0; s0 = 0; @@ -956,10 +941,10 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 4: - s7 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s6 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s5 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s4 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r02, src_r03, selector); + s6 = __byte_perm_S (src_r01, src_r02, selector); + s5 = __byte_perm_S (src_r00, src_r01, selector); + s4 = __byte_perm_S ( 0, src_r00, selector); s3 = 0; s2 = 0; s1 = 0; @@ -967,9 +952,9 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 5: - s7 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s6 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s5 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r01, src_r02, selector); + s6 = __byte_perm_S (src_r00, src_r01, selector); + s5 = __byte_perm_S ( 0, src_r00, selector); s4 = 0; s3 = 0; s2 = 0; @@ -978,8 +963,8 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 6: - s7 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s6 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r00, src_r01, selector); + s6 = __byte_perm_S ( 0, src_r00, selector); s5 = 0; s4 = 0; s3 = 0; @@ -989,7 +974,7 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 7: - s7 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S ( 0, src_r00, selector); s6 = 0; s5 = 0; s4 = 0; @@ -999,6 +984,7 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 s0 = 0; break; } + #endif buf0[0] = src_l0[0] | s0; buf0[1] = src_l0[1] | s1; @@ 
-1008,11 +994,9 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 buf1[1] = src_l1[1] | s5; buf1[2] = src_l1[2] | s6; buf1[3] = src_l1[3] | s7; - - #endif } -inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) +void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) { rshift_block_N (in0, in1, out0, out1, 32 - len); @@ -1038,7 +1022,7 @@ inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], con out1[3] = swap32_S (tib41[3]); } -inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); @@ -1052,7 +1036,7 @@ inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] &= ~(generate_cmask (buf0[0])); buf0[1] &= ~(generate_cmask (buf0[1])); @@ -1066,7 +1050,7 @@ inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { rule_op_mangle_lrest (p0, p1, buf0, buf1, in_len); @@ -1075,7 +1059,7 @@ inline u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { 
rule_op_mangle_urest (p0, p1, buf0, buf1, in_len); @@ -1084,7 +1068,7 @@ inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] ^= (generate_cmask (buf0[0])); buf0[1] ^= (generate_cmask (buf0[1])); @@ -1098,7 +1082,7 @@ inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1119,14 +1103,14 @@ inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { reverse_block (buf0, buf1, buf0, buf1, in_len); return in_len; } -inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1139,7 +1123,7 @@ inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (((in_len * p0) + in_len) >= 32) return (in_len); @@ -1167,7 +1151,7 @@ inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4 return 
out_len; } -inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1185,7 +1169,7 @@ inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1198,7 +1182,7 @@ inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1213,7 +1197,7 @@ inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1237,7 +1221,7 @@ inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1267,7 +1251,7 @@ inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 
rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1278,7 +1262,7 @@ inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1298,7 +1282,7 @@ inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1373,7 +1357,7 @@ inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1388,7 +1372,7 @@ inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1474,7 +1458,7 @@ inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf return out_len; } -inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -1546,7 +1530,7 @@ 
inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1569,7 +1553,7 @@ inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1578,7 +1562,7 @@ inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], return p0; } -inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { const uchar4 tmp0 = (uchar4) (p0); const uchar4 tmp1 = (uchar4) (p1); @@ -1597,7 +1581,7 @@ inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { u32 out_len = 0; @@ -1638,13 +1622,13 @@ inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -inline u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { // TODO return in_len; } -inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_first 
(const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1831,7 +1815,7 @@ inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1865,7 +1849,7 @@ inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4] return out_len; } -inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + in_len) >= 32) return (in_len); @@ -1898,7 +1882,7 @@ inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], return out_len; } -inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len < 2) return (in_len); @@ -1907,7 +1891,7 @@ inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len < 2) return (in_len); @@ -1992,7 +1976,7 @@ inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 
rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); if (p1 >= in_len) return (in_len); @@ -2239,7 +2223,7 @@ inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2261,7 +2245,7 @@ inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2283,7 +2267,7 @@ inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2307,7 +2291,7 @@ inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2331,7 +2315,7 @@ inline u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((p0 + 
1) >= in_len) return (in_len); @@ -2358,7 +2342,7 @@ inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 == 0) return (in_len); @@ -2387,7 +2371,7 @@ inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -2425,7 +2409,7 @@ inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[ return out_len; } -inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -2454,7 +2438,7 @@ inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); @@ -2497,7 +2481,7 @@ inline u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { u32 out_len = in_len; @@ -2549,7 +2533,7 @@ inline u32 apply_rule (const u32 name, 
const u32 p0, const u32 p1, u32 buf0[4], return out_len; } -inline u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) +u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) { u32 out_len = len; @@ -2567,7 +2551,7 @@ inline u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], cons return out_len; } -inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, __global const kernel_rule_t *rules_buf, const u32 il_pos, u32x buf0[4], u32x buf1[4]) +u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, __global const kernel_rule_t *rules_buf, const u32 il_pos, u32x buf0[4], u32x buf1[4]) { #if VECT_SIZE == 1 diff --git a/OpenCL/inc_simd.cl b/OpenCL/inc_simd.cl index 37548b44c..ac9f0410f 100644 --- a/OpenCL/inc_simd.cl +++ b/OpenCL/inc_simd.cl @@ -1054,7 +1054,7 @@ // attack-mode 0 -inline u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) +u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) { #if VECT_SIZE == 1 const u32x ix = (u32x) (bfs_buf[il_pos + 0].i); @@ -1073,7 +1073,7 @@ inline u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) // attack-mode 1 -inline u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_pos) +u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_pos) { #if VECT_SIZE == 1 const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len); @@ -1090,7 +1090,7 @@ inline u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_po return pw_lenx; } -inline u32x ix_create_combt (__global const pw_t *combs_buf, const u32 il_pos, const int idx) +u32x ix_create_combt (__global const pw_t *combs_buf, const u32 il_pos, const int idx) { #if VECT_SIZE == 1 const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx]); diff --git a/OpenCL/inc_truecrypt_xts.cl b/OpenCL/inc_truecrypt_xts.cl index eab39e80e..6bf6684d2 100644 --- 
a/OpenCL/inc_truecrypt_xts.cl +++ b/OpenCL/inc_truecrypt_xts.cl @@ -150,7 +150,7 @@ void twofish256_decrypt_xts_next (const u32 *in, u32 *out, u32 *T, u32 *sk, u32 // 512 bit -int verify_header_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_aes (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_aes[60]; @@ -206,7 +206,7 @@ int verify_header_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *u return 1; } -int verify_header_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) +int verify_header_serpent (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) { u32 ks_serpent[140]; @@ -262,7 +262,7 @@ int verify_header_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u3 return 1; } -int verify_header_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) +int verify_header_twofish (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) { u32 sk_twofish[4]; u32 lk_twofish[40]; @@ -321,7 +321,7 @@ int verify_header_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, const u3 // 1024 bit -int verify_header_aes_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_aes_twofish (__global const tc_t *esalt_bufs, const u32 *ukey1, 
const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_aes[60]; @@ -384,7 +384,7 @@ int verify_header_aes_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, cons return 1; } -int verify_header_serpent_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_serpent_aes (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_serpent[140]; u32 ks_aes[60]; @@ -445,7 +445,7 @@ int verify_header_serpent_aes (__global tc_t *esalt_bufs, const u32 *ukey1, cons return 1; } -int verify_header_twofish_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4) +int verify_header_twofish_serpent (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4) { u32 sk_twofish[4]; u32 lk_twofish[40]; @@ -510,7 +510,7 @@ int verify_header_twofish_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, // 1536 bit -int verify_header_aes_twofish_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 
*s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_aes_twofish_serpent (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_aes[60]; @@ -579,7 +579,7 @@ int verify_header_aes_twofish_serpent (__global tc_t *esalt_bufs, const u32 *uke return 1; } -int verify_header_serpent_twofish_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_serpent_twofish_aes (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_serpent[140]; diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index 2bfb641df..764ffd9c1 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -33,14 +33,14 @@ typedef VTYPE(uint, VECT_SIZE) u32x; typedef VTYPE(ulong, VECT_SIZE) u64x; #endif -inline u32 l32_from_64_S (u64 a) +u32 l32_from_64_S (u64 a) { const u32 r = (u32) (a); return r; } -inline u32 h32_from_64_S (u64 a) +u32 h32_from_64_S (u64 a) { a >>= 32; @@ -49,12 +49,12 @@ inline u32 h32_from_64_S (u64 a) return r; } -inline u64 hl32_to_64_S (const u32 a, const u32 b) +u64 hl32_to_64_S (const u32 a, const u32 b) { return as_ulong ((uint2) (b, a)); } -inline u32x 
l32_from_64 (u64x a) +u32x l32_from_64 (u64x a) { u32x r; @@ -93,7 +93,7 @@ inline u32x l32_from_64 (u64x a) return r; } -inline u32x h32_from_64 (u64x a) +u32x h32_from_64 (u64x a) { a >>= 32; @@ -134,7 +134,7 @@ inline u32x h32_from_64 (u64x a) return r; } -inline u64x hl32_to_64 (const u32x a, const u32x b) +u64x hl32_to_64 (const u32x a, const u32x b) { u64x r; @@ -174,140 +174,122 @@ inline u64x hl32_to_64 (const u32x a, const u32x b) } #ifdef IS_AMD -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { - return (as_uint (as_uchar4 (v).s3210)); + return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { - return (as_ulong (as_uchar8 (v).s76543210)); + return bitselect (bitselect (rotate (v, 24ul), + rotate (v, 8ul), 0x000000ff000000fful), + bitselect (rotate (v, 56ul), + rotate (v, 40ul), 0x00ff000000ff0000ul), + 0xffff0000ffff0000ul); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { - const u32 a0 = h32_from_64_S (a); - const u32 a1 = l32_from_64_S (a); - - const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n); - const u32 t1 = (n >= 32) ? 
amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n); - - const u64 r = hl32_to_64_S (t0, t1); - - return r; + return rotate (a, (u64) (64 - n)); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { - return rotr64_S (a, 64 - n); + return rotate (a, (u64) n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { - return ((v >> 24) & 0x000000ff) - | ((v >> 8) & 0x0000ff00) - | ((v << 8) & 0x00ff0000) - | ((v << 24) & 0xff000000); + return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { - return ((v >> 56) & 0x00000000000000ff) - | ((v >> 40) & 0x000000000000ff00) - | ((v >> 24) & 0x0000000000ff0000) - | ((v >> 8) & 0x00000000ff000000) - | ((v << 8) & 0x000000ff00000000) - | ((v << 24) & 0x0000ff0000000000) - | ((v << 40) & 0x00ff000000000000) - | ((v << 56) & 0xff00000000000000); + return bitselect (bitselect (rotate (v, 24ul), + rotate (v, 8ul), 0x000000ff000000fful), + bitselect (rotate (v, 56ul), + rotate (v, 40ul), 0x00ff000000ff0000ul), + 0xffff0000ffff0000ul); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { - const u32x a0 = h32_from_64 (a); - const u32x a1 = l32_from_64 (a); - - const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n); - const u32x t1 = (n >= 32) ? 
amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n); - - const u64x r = hl32_to_64 (t0, t1); - - return r; + return rotate (a, (u64x) (64 - n)); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { - return rotr64 (a, 64 - n); + return rotate (a, (u64x) n); } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { return amd_bfe (a, b, c); } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { return amd_bfe (a, b, c); } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { return amd_bytealign (a, b, c); } #endif #ifdef IS_NV -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64) (64 - n)); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { - return rotr64_S (a, 64 - n); + return rotate (a, (u64) n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -315,7 +297,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -327,27 +309,27 @@ inline u64x swap64 (const u64x v) | ((v << 
56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64x) (64 - n)); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { - return rotate (a, (u64) n); + return rotate (a, (u64x) n); } -inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) +u32x __byte_perm (const u32x a, const u32x b, const u32x c) { u32x r; @@ -386,7 +368,7 @@ inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) return r; } -inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) +u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -395,7 +377,7 @@ inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) return r; } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { u32x r; @@ -434,7 +416,7 @@ inline u32x __bfe (const u32x a, const u32x b, const u32x c) return r; } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -443,7 +425,7 @@ inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) return r; } -inline u32x amd_bytealign (const u32x a, const u32x b, const u32x c) +u32x amd_bytealign (const u32x a, const u32x b, const u32x c) { u32x r; @@ -490,7 +472,7 @@ inline u32x amd_bytealign (const u32x a, const u32x b, const u32x c) return r; } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -509,37 +491,37 @@ inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) #endif 
#ifdef IS_GENERIC -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64) (64 - n)); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { return rotate (a, (u64) n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -547,7 +529,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -559,27 +541,27 @@ inline u64x swap64 (const u64x v) | ((v << 56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64x) (64 - n)); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { - return rotate (a, (u64) n); + return rotate (a, (u64x) n); } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { #define BIT(x) ((u32x) (1u) << (x)) #define BIT_MASK(x) (BIT 
(x) - 1) @@ -592,7 +574,7 @@ inline u32x __bfe (const u32x a, const u32x b, const u32x c) #undef BFE } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { #define BIT(x) (1u << (x)) #define BIT_MASK(x) (BIT (x) - 1) @@ -605,7 +587,7 @@ inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) #undef BFE } -inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c) +u32x amd_bytealign (const u32x a, const u32x b, const u32 c) { #if VECT_SIZE == 1 const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8); @@ -638,7 +620,7 @@ inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c) #endif } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8); @@ -809,10 +791,10 @@ typedef struct bitcoin_wallet typedef struct sip { - u32 salt_buf[30]; + u32 salt_buf[32]; u32 salt_len; - u32 esalt_buf[38]; + u32 esalt_buf[48]; u32 esalt_len; } sip_t; diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index e990b0a31..e6acd7d23 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -153,9 +153,6 @@ #if KERN_TYPE == 13800 #undef _unroll #endif -#if KERN_TYPE == 14100 -#undef _unroll -#endif // nvidia specific @@ -177,6 +174,9 @@ #if KERN_TYPE == 14000 #undef _unroll #endif +#if KERN_TYPE == 14100 +#undef _unroll +#endif #endif #endif @@ -186,25 +186,7 @@ #ifdef IS_AMD #ifdef IS_GPU -#if KERN_TYPE == 1700 -#undef _unroll -#endif -#if KERN_TYPE == 1710 -#undef _unroll -#endif -#if KERN_TYPE == 5200 -#undef _unroll -#endif -#if KERN_TYPE == 10800 -#undef _unroll -#endif -#if KERN_TYPE == 10900 -#undef _unroll -#endif -#if KERN_TYPE == 12800 -#undef _unroll -#endif -#if KERN_TYPE == 12900 +#if KERN_TYPE == 8000 #undef _unroll #endif diff --git a/OpenCL/m00000_a0.cl b/OpenCL/m00000_a0.cl index a30464522..82690362f 100644 --- 
a/OpenCL/m00000_a0.cl +++ b/OpenCL/m00000_a0.cl @@ -39,8 +39,6 @@ __kernel void m00000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m00000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00000_a3.cl b/OpenCL/m00000_a3.cl index 175c759fd..cb172a09d 100644 --- a/OpenCL/m00000_a3.cl +++ b/OpenCL/m00000_a3.cl @@ -37,8 +37,6 @@ __kernel void m00000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m00000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00010_a0.cl b/OpenCL/m00010_a0.cl index f353def68..76e002021 100644 --- a/OpenCL/m00010_a0.cl +++ b/OpenCL/m00010_a0.cl @@ -39,8 +39,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00010_a1.cl b/OpenCL/m00010_a1.cl index 7a18b79d6..b970e65e5 100644 --- a/OpenCL/m00010_a1.cl +++ b/OpenCL/m00010_a1.cl @@ -37,8 +37,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00010_a3.cl b/OpenCL/m00010_a3.cl index 304bad19d..02bbb9768 100644 --- a/OpenCL/m00010_a3.cl +++ b/OpenCL/m00010_a3.cl @@ -37,8 +37,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00020_a0.cl b/OpenCL/m00020_a0.cl index ec7f7187d..4cd3e5e84 100644 --- 
a/OpenCL/m00020_a0.cl +++ b/OpenCL/m00020_a0.cl @@ -39,8 +39,6 @@ __kernel void m00020_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00020_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00020_a3.cl b/OpenCL/m00020_a3.cl index fb46e8374..c22d90731 100644 --- a/OpenCL/m00020_a3.cl +++ b/OpenCL/m00020_a3.cl @@ -37,8 +37,6 @@ __kernel void m00020_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m00020_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00030_a0.cl b/OpenCL/m00030_a0.cl index 81fe98a62..349802d62 100644 --- a/OpenCL/m00030_a0.cl +++ b/OpenCL/m00030_a0.cl @@ -39,8 +39,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void 
m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00030_a1.cl b/OpenCL/m00030_a1.cl index 2160c48e9..babf8745d 100644 --- a/OpenCL/m00030_a1.cl +++ b/OpenCL/m00030_a1.cl @@ -37,8 +37,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00030_a3.cl b/OpenCL/m00030_a3.cl index 10c2b0c00..d750b40cb 100644 --- a/OpenCL/m00030_a3.cl +++ b/OpenCL/m00030_a3.cl @@ -37,8 +37,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00040_a0.cl b/OpenCL/m00040_a0.cl index 
bf4aa3ff9..af4b80719 100644 --- a/OpenCL/m00040_a0.cl +++ b/OpenCL/m00040_a0.cl @@ -39,8 +39,6 @@ __kernel void m00040_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00040_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00040_a3.cl b/OpenCL/m00040_a3.cl index be5995613..af341b374 100644 --- a/OpenCL/m00040_a3.cl +++ b/OpenCL/m00040_a3.cl @@ -37,8 +37,6 @@ __kernel void m00040_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m00040_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00050_a0.cl b/OpenCL/m00050_a0.cl index 263ef488f..51e5e14f6 100644 --- a/OpenCL/m00050_a0.cl +++ b/OpenCL/m00050_a0.cl @@ -39,8 +39,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ 
-130,8 +124,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00050_a1.cl b/OpenCL/m00050_a1.cl index 7d75b6e4f..702ebebbe 100644 --- a/OpenCL/m00050_a1.cl +++ b/OpenCL/m00050_a1.cl @@ -37,8 +37,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -82,13 +78,13 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - md5_hmac_ctx_vector_t ctx; + md5_hmac_ctx_t ctx; - md5_hmac_init_vector (&ctx, c, pw_len + comb_len); + md5_hmac_init (&ctx, c, pw_len + comb_len); - md5_hmac_update_vector (&ctx, s, salt_len); + md5_hmac_update (&ctx, s, salt_len); - md5_hmac_final_vector (&ctx); + md5_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; @@ -135,8 +131,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -180,13 +172,13 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - md5_hmac_ctx_vector_t ctx; + md5_hmac_ctx_t ctx; - 
md5_hmac_init_vector (&ctx, c, pw_len + comb_len); + md5_hmac_init (&ctx, c, pw_len + comb_len); - md5_hmac_update_vector (&ctx, s, salt_len); + md5_hmac_update (&ctx, s, salt_len); - md5_hmac_final_vector (&ctx); + md5_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; diff --git a/OpenCL/m00050_a3.cl b/OpenCL/m00050_a3.cl index 09f223a60..209ecdb74 100644 --- a/OpenCL/m00050_a3.cl +++ b/OpenCL/m00050_a3.cl @@ -37,8 +37,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00060_a0.cl b/OpenCL/m00060_a0.cl index ea4b96827..75369b728 100644 --- a/OpenCL/m00060_a0.cl +++ b/OpenCL/m00060_a0.cl @@ -39,8 +39,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = 
salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; diff --git a/OpenCL/m00060_a1.cl b/OpenCL/m00060_a1.cl index a0e4f0a23..6cff7c337 100644 --- a/OpenCL/m00060_a1.cl +++ b/OpenCL/m00060_a1.cl @@ -37,8 +37,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; diff --git a/OpenCL/m00060_a3.cl b/OpenCL/m00060_a3.cl index 1c1d79a29..c1f165249 100644 --- a/OpenCL/m00060_a3.cl +++ b/OpenCL/m00060_a3.cl @@ -37,8 +37,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int 
idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m00100_a0.cl b/OpenCL/m00100_a0.cl index a8bd7ec57..c14eaef04 100644 --- a/OpenCL/m00100_a0.cl +++ b/OpenCL/m00100_a0.cl @@ -39,8 +39,6 @@ __kernel void m00100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m00100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00100_a3.cl b/OpenCL/m00100_a3.cl index 50ecd137b..f125ff993 100644 --- a/OpenCL/m00100_a3.cl +++ b/OpenCL/m00100_a3.cl @@ -37,8 +37,6 @@ __kernel void m00100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m00100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - 
barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00110_a0.cl b/OpenCL/m00110_a0.cl index da9c38d03..030b0ca2e 100644 --- a/OpenCL/m00110_a0.cl +++ b/OpenCL/m00110_a0.cl @@ -39,8 +39,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00110_a1.cl b/OpenCL/m00110_a1.cl index 3d8987f72..e9a349591 100644 --- a/OpenCL/m00110_a1.cl +++ b/OpenCL/m00110_a1.cl @@ -37,8 +37,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00110_a3.cl b/OpenCL/m00110_a3.cl index 00418eafc..6320923dd 100644 --- a/OpenCL/m00110_a3.cl +++ b/OpenCL/m00110_a3.cl @@ -37,8 +37,6 @@ __kernel void 
m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00120_a0.cl b/OpenCL/m00120_a0.cl index 6849a85a8..78dca9c65 100644 --- a/OpenCL/m00120_a0.cl +++ b/OpenCL/m00120_a0.cl @@ -39,8 +39,6 @@ __kernel void m00120_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00120_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00120_a3.cl b/OpenCL/m00120_a3.cl index 178dd369e..04ee961f2 100644 --- a/OpenCL/m00120_a3.cl +++ b/OpenCL/m00120_a3.cl @@ -37,8 +37,6 @@ __kernel void m00120_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m00120_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00130_a0.cl b/OpenCL/m00130_a0.cl index ae3d7372c..859b3c2c1 100644 --- a/OpenCL/m00130_a0.cl +++ b/OpenCL/m00130_a0.cl @@ -39,8 +39,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00130_a1.cl b/OpenCL/m00130_a1.cl index 2f7f11bd4..4d6b8e020 100644 --- a/OpenCL/m00130_a1.cl +++ b/OpenCL/m00130_a1.cl @@ -37,8 +37,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00130_a3.cl b/OpenCL/m00130_a3.cl index 
8024f2233..b1cf6ddcc 100644 --- a/OpenCL/m00130_a3.cl +++ b/OpenCL/m00130_a3.cl @@ -37,8 +37,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -124,8 +120,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -137,8 +131,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00140_a0.cl b/OpenCL/m00140_a0.cl index 1b8c5e717..aa9c8b04b 100644 --- a/OpenCL/m00140_a0.cl +++ b/OpenCL/m00140_a0.cl @@ -39,8 +39,6 @@ __kernel void m00140_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00140_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00140_a3.cl b/OpenCL/m00140_a3.cl index 75d22495b..91ad701ac 100644 --- a/OpenCL/m00140_a3.cl +++ b/OpenCL/m00140_a3.cl @@ -37,8 +37,6 @@ __kernel void m00140_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier 
(CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m00140_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00150_a0.cl b/OpenCL/m00150_a0.cl index 957d2a50b..3e4d11f88 100644 --- a/OpenCL/m00150_a0.cl +++ b/OpenCL/m00150_a0.cl @@ -39,8 +39,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -130,8 +124,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00150_a1.cl b/OpenCL/m00150_a1.cl index 15ab612bf..9b55986b6 100644 --- a/OpenCL/m00150_a1.cl +++ b/OpenCL/m00150_a1.cl @@ -37,8 +37,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S 
(salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -82,13 +78,13 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha1_hmac_ctx_vector_t ctx; + sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha1_hmac_init (&ctx, c, pw_len + comb_len); - sha1_hmac_update_vector (&ctx, s, salt_len); + sha1_hmac_update (&ctx, s, salt_len); - sha1_hmac_final_vector (&ctx); + sha1_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; @@ -135,8 +131,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -180,13 +172,13 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha1_hmac_ctx_vector_t ctx; + sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha1_hmac_init (&ctx, c, pw_len + comb_len); - sha1_hmac_update_vector (&ctx, s, salt_len); + sha1_hmac_update (&ctx, s, salt_len); - sha1_hmac_final_vector (&ctx); + sha1_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; diff --git a/OpenCL/m00150_a3.cl b/OpenCL/m00150_a3.cl index 27c7dc7dd..195218abf 100644 --- a/OpenCL/m00150_a3.cl +++ b/OpenCL/m00150_a3.cl @@ -37,8 +37,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void 
m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00160_a0.cl b/OpenCL/m00160_a0.cl index c87a1bab7..d32cdac55 100644 --- a/OpenCL/m00160_a0.cl +++ b/OpenCL/m00160_a0.cl @@ -39,8 +39,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_t ctx0; diff --git a/OpenCL/m00160_a1.cl b/OpenCL/m00160_a1.cl index 6a4907c55..b8329a326 100644 --- a/OpenCL/m00160_a1.cl +++ 
b/OpenCL/m00160_a1.cl @@ -37,8 +37,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_t ctx0; diff --git a/OpenCL/m00160_a3.cl b/OpenCL/m00160_a3.cl index cdcca3d64..1712e8064 100644 --- a/OpenCL/m00160_a3.cl +++ b/OpenCL/m00160_a3.cl @@ -37,8 +37,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ 
-136,8 +130,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m00300_a0.cl b/OpenCL/m00300_a0.cl index 04e7f4940..d05d17076 100644 --- a/OpenCL/m00300_a0.cl +++ b/OpenCL/m00300_a0.cl @@ -39,8 +39,6 @@ __kernel void m00300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -127,8 +125,6 @@ __kernel void m00300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00300_a3.cl b/OpenCL/m00300_a3.cl index 6a9763f3f..d68b2608c 100644 --- a/OpenCL/m00300_a3.cl +++ b/OpenCL/m00300_a3.cl @@ -37,8 +37,6 @@ __kernel void m00300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -131,8 +129,6 @@ __kernel void m00300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00400.cl b/OpenCL/m00400.cl index 821e4a02a..5d3ad84b4 100644 --- a/OpenCL/m00400.cl +++ b/OpenCL/m00400.cl @@ -76,8 +76,6 @@ __kernel void m00400_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } u32 digest[4]; diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl index 81cd4f8bf..5ccb727aa 100644 --- a/OpenCL/m00500-optimized.cl +++ b/OpenCL/m00500-optimized.cl @@ -10,111 +10,14 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include 
"inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" #define md5crypt_magic 0x00243124u -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP 
(MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, 
c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -122,44 +25,45 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = 
__byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -226,7 +130,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -234,44 +138,47 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = 
amd_bytealign (in3, in4, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -338,44 +245,41 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, 
offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m00500.cl b/OpenCL/m00500.cl index 54422ec96..80371e61d 100644 --- a/OpenCL/m00500.cl +++ b/OpenCL/m00500.cl @@ -40,8 +40,6 @@ __kernel void m00500_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -53,8 +51,6 @@ __kernel void m00500_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -154,8 +150,6 @@ __kernel void m00500_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -167,8 +161,6 @@ __kernel void m00500_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00900_a0.cl b/OpenCL/m00900_a0.cl index cf19e1c05..a58b5f934 100644 --- a/OpenCL/m00900_a0.cl +++ b/OpenCL/m00900_a0.cl @@ -39,8 +39,6 @@ __kernel void m00900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; 
idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m00900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00900_a3.cl b/OpenCL/m00900_a3.cl index 480a1e3c4..52ab4da60 100644 --- a/OpenCL/m00900_a3.cl +++ b/OpenCL/m00900_a3.cl @@ -37,8 +37,6 @@ __kernel void m00900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m00900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01000_a0.cl b/OpenCL/m01000_a0.cl index d01ddc0d5..c28b005e6 100644 --- a/OpenCL/m01000_a0.cl +++ b/OpenCL/m01000_a0.cl @@ -39,8 +39,6 @@ __kernel void m01000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01000_a3.cl b/OpenCL/m01000_a3.cl index a9a421686..d4048fffb 100644 --- a/OpenCL/m01000_a3.cl +++ b/OpenCL/m01000_a3.cl @@ -37,8 +37,6 @@ __kernel void m01000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git 
a/OpenCL/m01100_a0.cl b/OpenCL/m01100_a0.cl index e257ed697..9dc3d1638 100644 --- a/OpenCL/m01100_a0.cl +++ b/OpenCL/m01100_a0.cl @@ -39,21 +39,17 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -132,21 +128,17 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01100_a1.cl b/OpenCL/m01100_a1.cl index 56fa3bfda..893122399 100644 --- a/OpenCL/m01100_a1.cl +++ b/OpenCL/m01100_a1.cl @@ -32,13 +32,11 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md4_ctx_t ctx0; @@ -114,13 +112,11 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md4_ctx_t ctx0; diff --git a/OpenCL/m01100_a3.cl b/OpenCL/m01100_a3.cl index 461b834a8..03e59c812 
100644 --- a/OpenCL/m01100_a3.cl +++ b/OpenCL/m01100_a3.cl @@ -37,8 +37,6 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -136,8 +132,6 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -149,8 +143,6 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01300_a0.cl b/OpenCL/m01300_a0.cl index 7607c95f8..588e58786 100644 --- a/OpenCL/m01300_a0.cl +++ b/OpenCL/m01300_a0.cl @@ -39,8 +39,6 @@ __kernel void m01300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01300_a3.cl b/OpenCL/m01300_a3.cl index cbb1203a8..d756fd264 100644 --- a/OpenCL/m01300_a3.cl +++ b/OpenCL/m01300_a3.cl @@ -37,8 +37,6 @@ __kernel void m01300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01300_sxx (__global 
pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01400_a0.cl b/OpenCL/m01400_a0.cl index 3f07773af..b26194a63 100644 --- a/OpenCL/m01400_a0.cl +++ b/OpenCL/m01400_a0.cl @@ -39,8 +39,6 @@ __kernel void m01400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01400_a3.cl b/OpenCL/m01400_a3.cl index 6595c7b9a..3cba71a25 100644 --- a/OpenCL/m01400_a3.cl +++ b/OpenCL/m01400_a3.cl @@ -37,8 +37,6 @@ __kernel void m01400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01410_a0.cl b/OpenCL/m01410_a0.cl index 185c9e132..b517e02bd 100644 --- a/OpenCL/m01410_a0.cl +++ b/OpenCL/m01410_a0.cl @@ -39,8 +39,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01410_a1.cl b/OpenCL/m01410_a1.cl index bf7a01885..b7d22426c 100644 --- a/OpenCL/m01410_a1.cl +++ b/OpenCL/m01410_a1.cl @@ -37,8 +37,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01410_a3.cl b/OpenCL/m01410_a3.cl index a1a7bd150..dd860cce1 100644 --- a/OpenCL/m01410_a3.cl +++ b/OpenCL/m01410_a3.cl @@ -37,8 +37,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel 
void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01420_a0.cl b/OpenCL/m01420_a0.cl index 4733245f6..6adc7367f 100644 --- a/OpenCL/m01420_a0.cl +++ b/OpenCL/m01420_a0.cl @@ -39,8 +39,6 @@ __kernel void m01420_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01420_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01420_a3.cl b/OpenCL/m01420_a3.cl index 02784ecd0..11526045b 100644 --- a/OpenCL/m01420_a3.cl +++ b/OpenCL/m01420_a3.cl @@ -37,8 +37,6 @@ __kernel void m01420_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01420_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01430_a0.cl b/OpenCL/m01430_a0.cl index 45ef46ba9..e3236eef1 100644 --- a/OpenCL/m01430_a0.cl +++ b/OpenCL/m01430_a0.cl @@ -39,8 +39,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - 
barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01430_a1.cl b/OpenCL/m01430_a1.cl index 2d0efa682..8eef1f177 100644 --- a/OpenCL/m01430_a1.cl +++ b/OpenCL/m01430_a1.cl @@ -37,8 +37,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01430_a3.cl b/OpenCL/m01430_a3.cl index 2a6ccc3f8..3259e7c17 100644 --- a/OpenCL/m01430_a3.cl +++ b/OpenCL/m01430_a3.cl @@ -37,8 +37,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -124,8 +120,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - 
- barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -137,8 +131,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01440_a0.cl b/OpenCL/m01440_a0.cl index 4dcf235e0..271d4367e 100644 --- a/OpenCL/m01440_a0.cl +++ b/OpenCL/m01440_a0.cl @@ -39,8 +39,6 @@ __kernel void m01440_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01440_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01440_a3.cl b/OpenCL/m01440_a3.cl index a01604ac3..316e0d156 100644 --- a/OpenCL/m01440_a3.cl +++ b/OpenCL/m01440_a3.cl @@ -37,8 +37,6 @@ __kernel void m01440_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01440_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01450_a0.cl b/OpenCL/m01450_a0.cl index 21fa01b0c..e1462f303 100644 --- a/OpenCL/m01450_a0.cl +++ b/OpenCL/m01450_a0.cl @@ -39,8 +39,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -130,8 +124,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01450_a1.cl b/OpenCL/m01450_a1.cl index c482c7626..7f7c9f635 100644 --- a/OpenCL/m01450_a1.cl +++ b/OpenCL/m01450_a1.cl @@ -37,8 +37,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -82,13 +78,13 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha256_hmac_ctx_vector_t ctx; + sha256_hmac_ctx_t ctx; - sha256_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha256_hmac_init (&ctx, c, pw_len + comb_len); - sha256_hmac_update_vector (&ctx, s, salt_len); + sha256_hmac_update (&ctx, s, salt_len); - sha256_hmac_final_vector (&ctx); + sha256_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; @@ -135,8 +131,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - 
barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -180,13 +172,13 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha256_hmac_ctx_vector_t ctx; + sha256_hmac_ctx_t ctx; - sha256_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha256_hmac_init (&ctx, c, pw_len + comb_len); - sha256_hmac_update_vector (&ctx, s, salt_len); + sha256_hmac_update (&ctx, s, salt_len); - sha256_hmac_final_vector (&ctx); + sha256_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; diff --git a/OpenCL/m01450_a3.cl b/OpenCL/m01450_a3.cl index c55b53eae..35bd59a2a 100644 --- a/OpenCL/m01450_a3.cl +++ b/OpenCL/m01450_a3.cl @@ -37,8 +37,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git 
a/OpenCL/m01460_a0.cl b/OpenCL/m01460_a0.cl index 0b14e52e9..d51c28cb6 100644 --- a/OpenCL/m01460_a0.cl +++ b/OpenCL/m01460_a0.cl @@ -39,8 +39,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_t ctx0; diff --git a/OpenCL/m01460_a1.cl b/OpenCL/m01460_a1.cl index d4b0fdb8b..61c30c095 100644 --- a/OpenCL/m01460_a1.cl +++ b/OpenCL/m01460_a1.cl @@ -37,8 +37,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S 
(pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_t ctx0; diff --git a/OpenCL/m01460_a3-optimized.cl b/OpenCL/m01460_a3-optimized.cl index 156b75bcc..d20d313ff 100644 --- a/OpenCL/m01460_a3-optimized.cl +++ b/OpenCL/m01460_a3-optimized.cl @@ -221,7 +221,7 @@ void hmac_sha256_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[ sha256_transform (w0, w1, w2, w3, digest); } -void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global void *esal_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global 
const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier diff --git a/OpenCL/m01460_a3.cl b/OpenCL/m01460_a3.cl index 646ed4732..d7ac9bedf 100644 --- a/OpenCL/m01460_a3.cl +++ b/OpenCL/m01460_a3.cl @@ -37,8 +37,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } 
sha256_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m01500_a3.cl b/OpenCL/m01500_a3.cl index f54007b40..e372251c4 100644 --- a/OpenCL/m01500_a3.cl +++ b/OpenCL/m01500_a3.cl @@ -14,7 +14,20 @@ #define COMPARE_S "inc_comp_single_bs.cl" #define COMPARE_M "inc_comp_multi_bs.cl" -#define myselx(a,b,c) ((c) ? (b) : (a)) +#ifdef IS_NV +#define KXX_DECL +#define sXXX_DECL +#endif + +#ifdef IS_AMD +#define KXX_DECL +#define sXXX_DECL +#endif + +#ifdef IS_GENERIC +#define KXX_DECL +#define sXXX_DECL +#endif #ifdef IS_NV @@ -888,11 +901,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). 
@@ -911,473 +924,561 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, 
x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, 
x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ 
a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = 
x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 
& ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 = x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - 
vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; 
+ x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, 
x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + 
x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ 
x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | 
x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = 
a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = 
x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, 
x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + 
x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ 
xC729695A; + *out1 ^= x01; } #endif +//#define SWAP(a, b) { u32 tmp=*a;*a=*b;*b=tmp; } #define SWAP(a, b) { u32 tmp=*a;*a=*b;*b=tmp; } #define DATASWAP \ @@ -1431,37 +1532,24 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #define KEYSET07 { k00 = K31; k01 = K35; k02 = K52; k03 = K43; k04 = K08; k05 = K37; k06 = K51; k07 = K15; k08 = K49; k09 = K30; k10 = K07; k11 = K02; k12 = K50; k13 = K21; k14 = K45; k15 = K44; k16 = K29; k17 = K16; k18 = K42; k19 = K23; k20 = K22; k21 = K14; k22 = K38; k23 = K01; k24 = K10; k25 = K47; k26 = K53; k27 = K11; k28 = K27; k29 = K26; k30 = K05; k31 = K17; k32 = K54; k33 = K41; k34 = K39; k35 = K20; k36 = K48; k37 = K13; k38 = K24; k39 = K19; k40 = K32; k41 = K40; k42 = K34; k43 = K03; k44 = K06; k45 = K18; k46 = K12; k47 = K46; } #define KEYSET17 { k00 = K15; k01 = K51; k02 = K36; k03 = K02; k04 = K49; k05 = K21; k06 = K35; k07 = K31; k08 = K08; k09 = K14; k10 = K23; k11 = K43; k12 = K09; k13 = K37; k14 = K29; k15 = K28; k16 = K45; k17 = K00; k18 = K01; k19 = K07; k20 = K38; k21 = K30; k22 = K22; k23 = K42; k24 = K26; k25 = K04; k26 = K41; k27 = K54; k28 = K39; k29 = K10; k30 = K48; k31 = K33; k32 = K11; k33 = K53; k34 = K27; k35 = K32; k36 = K05; k37 = K25; k38 = K40; k39 = K03; k40 = K20; k41 = K24; k42 = K46; k43 = K19; k44 = K18; k45 = K06; k46 = K55; k47 = K34; } -#ifdef IS_NV -#define KXX_DECL -#define sXXX_DECL -#endif - -#ifdef IS_AMD -#define KXX_DECL -#define sXXX_DECL -#endif - -#ifdef IS_GENERIC -#define KXX_DECL -#define sXXX_DECL -#endif +#define myselx(a,b,c) ((c) ? 
(b) : (a)) #ifdef DESCRYPT_SALT void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63) { - sXXX_DECL u32 s001 = (0x001 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s002 = (0x002 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s004 = (0x004 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s008 = (0x008 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s010 = (0x010 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s020 = (0x020 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s040 = (0x040 & DESCRYPT_SALT) ? 
0xffffffff : 0; - sXXX_DECL u32 s080 = (0x080 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s100 = (0x100 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s200 = (0x200 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s400 = (0x400 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s800 = (0x800 & DESCRYPT_SALT) ? 0xffffffff : 0; + sXXX_DECL u32 s001 = (0x001 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s002 = (0x002 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s004 = (0x004 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s008 = (0x008 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s010 = (0x010 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s020 = (0x020 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s040 = (0x040 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s080 = (0x080 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s100 = (0x100 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s200 = (0x200 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s400 = (0x400 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s800 = (0x800 & DESCRYPT_SALT) ? 
1 : 0; KXX_DECL u32 k00, k01, k02, k03, k04, k05; KXX_DECL u32 k06, k07, k08, k09, k10, k11; @@ -1474,60 +1562,6 @@ void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, cons for (u32 ii = 0; ii < 25; ii++) { - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(myselx (*D63, *D47, s001) ^ k00, myselx (*D32, *D48, s002) ^ k01, myselx (*D33, *D49, s004) ^ k02, myselx (*D34, *D50, s008) ^ k03, myselx (*D35, *D51, s010) ^ k04, myselx (*D36, *D52, s020) ^ k05, D08, D16, D22, D30); - s2(myselx (*D35, *D51, s040) ^ k06, myselx (*D36, *D52, s080) ^ k07, myselx (*D37, *D53, s100) ^ k08, myselx (*D38, *D54, s200) ^ k09, myselx (*D39, *D55, s400) ^ k10, myselx (*D40, *D56, s800) ^ k11, D12, D27, D01, D17); - s3( *D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4( *D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(myselx (*D47, *D63, s001) ^ k24, myselx (*D48, *D32, s002) ^ k25, myselx (*D49, *D33, s004) ^ k26, myselx (*D50, *D34, s008) ^ k27, myselx (*D51, *D35, s010) ^ k28, myselx (*D52, *D36, s020) ^ k29, D07, D13, D24, D02); - s6(myselx (*D51, *D35, s040) ^ k30, myselx (*D52, *D36, s080) ^ k31, myselx (*D53, *D37, s100) ^ k32, myselx (*D54, *D38, s200) ^ k33, myselx (*D55, *D39, s400) ^ k34, myselx (*D56, *D40, s800) ^ k35, D03, D28, D10, D18); - s7( *D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8( *D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: 
KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(myselx (*D31, *D15, s001) ^ k00, myselx (*D00, *D16, s002) ^ k01, myselx (*D01, *D17, s004) ^ k02, myselx (*D02, *D18, s008) ^ k03, myselx (*D03, *D19, s010) ^ k04, myselx (*D04, *D20, s020) ^ k05, D40, D48, D54, D62); - s2(myselx (*D03, *D19, s040) ^ k06, myselx (*D04, *D20, s080) ^ k07, myselx (*D05, *D21, s100) ^ k08, myselx (*D06, *D22, s200) ^ k09, myselx (*D07, *D23, s400) ^ k10, myselx (*D08, *D24, s800) ^ k11, D44, D59, D33, D49); - s3( *D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4( *D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32); - s5(myselx (*D15, *D31, s001) ^ k24, myselx (*D16, *D00, s002) ^ k25, myselx (*D17, *D01, s004) ^ k26, myselx (*D18, *D02, s008) ^ k27, myselx (*D19, *D03, s010) ^ k28, myselx (*D20, *D04, s020) ^ k29, D39, D45, D56, D34); - s6(myselx (*D19, *D03, s040) ^ k30, myselx (*D20, *D04, s080) ^ k31, myselx (*D21, *D05, s100) ^ k32, myselx (*D22, *D06, s200) ^ k33, myselx (*D23, *D07, s400) ^ k34, myselx (*D24, *D08, s800) ^ k35, D35, D60, D42, D50); - s7( *D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8( *D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1622,8 +1656,6 @@ void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, cons s8( *D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - #endif - DATASWAP; } diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl index fd5c4f293..af6203263 100644 --- a/OpenCL/m01600-optimized.cl +++ b/OpenCL/m01600-optimized.cl @@ -8,6 +8,7 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" 
+#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" @@ -15,105 +16,7 @@ #define md5apr1_magic0 0x72706124u #define md5apr1_magic1 0x00002431u -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, 
d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, 
MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -121,44 +24,45 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm 
(append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -225,7 +129,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -233,44 +137,47 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = 
amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -337,44 +244,41 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, 
offset); + tmp2 = amd_bytealign (in1, 0, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m01600.cl b/OpenCL/m01600.cl index dc110a813..bdc7e5334 100644 --- a/OpenCL/m01600.cl +++ b/OpenCL/m01600.cl @@ -41,8 +41,6 @@ __kernel void m01600_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -54,8 +52,6 @@ __kernel void m01600_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -156,8 +152,6 @@ __kernel void m01600_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -169,8 +163,6 @@ __kernel void m01600_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01700_a0.cl b/OpenCL/m01700_a0.cl index 01649a999..72e3f0f2b 100644 --- a/OpenCL/m01700_a0.cl +++ b/OpenCL/m01700_a0.cl @@ -39,8 +39,6 @@ __kernel void m01700_mxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01700_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01700_a3.cl b/OpenCL/m01700_a3.cl index e35083c8b..80f66afb2 100644 --- a/OpenCL/m01700_a3.cl +++ b/OpenCL/m01700_a3.cl @@ -37,8 +37,6 @@ __kernel void m01700_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01700_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01710_a0.cl b/OpenCL/m01710_a0.cl index 593be6b7d..acd32a0f3 100644 --- a/OpenCL/m01710_a0.cl +++ b/OpenCL/m01710_a0.cl @@ -39,8 +39,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - 
- barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01710_a1.cl b/OpenCL/m01710_a1.cl index 485adbb32..b09354e90 100644 --- a/OpenCL/m01710_a1.cl +++ b/OpenCL/m01710_a1.cl @@ -37,8 +37,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01710_a3.cl b/OpenCL/m01710_a3.cl index 6da78255b..f54ce7355 100644 --- a/OpenCL/m01710_a3.cl +++ b/OpenCL/m01710_a3.cl @@ -37,8 +37,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01720_a0.cl b/OpenCL/m01720_a0.cl index adcf3501b..e80c9426c 100644 --- a/OpenCL/m01720_a0.cl +++ b/OpenCL/m01720_a0.cl @@ -39,8 +39,6 @@ __kernel 
void m01720_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01720_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01720_a3.cl b/OpenCL/m01720_a3.cl index e6bb100d3..b28688280 100644 --- a/OpenCL/m01720_a3.cl +++ b/OpenCL/m01720_a3.cl @@ -37,8 +37,6 @@ __kernel void m01720_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01720_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01730_a0.cl b/OpenCL/m01730_a0.cl index f4a8951a8..16a6753a7 100644 --- a/OpenCL/m01730_a0.cl +++ b/OpenCL/m01730_a0.cl @@ -39,8 +39,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01730_a1.cl b/OpenCL/m01730_a1.cl index 7529dd71d..a937af113 100644 --- a/OpenCL/m01730_a1.cl +++ b/OpenCL/m01730_a1.cl @@ -37,8 +37,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01730_a3.cl b/OpenCL/m01730_a3.cl index c408f8105..90d53fd56 100644 --- a/OpenCL/m01730_a3.cl +++ b/OpenCL/m01730_a3.cl @@ -37,8 +37,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -124,8 +120,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -137,8 +131,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01740_a0.cl 
b/OpenCL/m01740_a0.cl index d489d6e67..0c76ab222 100644 --- a/OpenCL/m01740_a0.cl +++ b/OpenCL/m01740_a0.cl @@ -39,8 +39,6 @@ __kernel void m01740_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01740_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01740_a3.cl b/OpenCL/m01740_a3.cl index 6ae09ef93..0e879be01 100644 --- a/OpenCL/m01740_a3.cl +++ b/OpenCL/m01740_a3.cl @@ -37,8 +37,6 @@ __kernel void m01740_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01740_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01750_a0.cl b/OpenCL/m01750_a0.cl index 0ff9c0346..b3e12bff9 100644 --- a/OpenCL/m01750_a0.cl +++ b/OpenCL/m01750_a0.cl @@ -39,8 +39,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const 
u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -130,8 +124,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01750_a1.cl b/OpenCL/m01750_a1.cl index 27be72d5e..430d9a7ce 100644 --- a/OpenCL/m01750_a1.cl +++ b/OpenCL/m01750_a1.cl @@ -37,8 +37,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -82,13 +78,13 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha512_hmac_ctx_vector_t ctx; + sha512_hmac_ctx_t ctx; - sha512_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha512_hmac_init (&ctx, c, pw_len + comb_len); - sha512_hmac_update_vector (&ctx, s, salt_len); + sha512_hmac_update (&ctx, s, salt_len); - sha512_hmac_final_vector (&ctx); + sha512_hmac_final (&ctx); const u32 r0 = l32_from_64_S (ctx.opad.h[7]); const u32 r1 = h32_from_64_S (ctx.opad.h[7]); @@ -135,8 +131,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -180,13 +172,13 @@ 
__kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha512_hmac_ctx_vector_t ctx; + sha512_hmac_ctx_t ctx; - sha512_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha512_hmac_init (&ctx, c, pw_len + comb_len); - sha512_hmac_update_vector (&ctx, s, salt_len); + sha512_hmac_update (&ctx, s, salt_len); - sha512_hmac_final_vector (&ctx); + sha512_hmac_final (&ctx); const u32 r0 = l32_from_64_S (ctx.opad.h[7]); const u32 r1 = h32_from_64_S (ctx.opad.h[7]); diff --git a/OpenCL/m01750_a3.cl b/OpenCL/m01750_a3.cl index 28c6a4b38..a9c59d030 100644 --- a/OpenCL/m01750_a3.cl +++ b/OpenCL/m01750_a3.cl @@ -37,8 +37,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01760_a0.cl b/OpenCL/m01760_a0.cl index f977db7ac..460e1cb3f 100644 --- a/OpenCL/m01760_a0.cl +++ b/OpenCL/m01760_a0.cl @@ -39,8 +39,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 
salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_t ctx0; diff --git a/OpenCL/m01760_a1.cl b/OpenCL/m01760_a1.cl index 5403bcc5d..1a558b84f 100644 --- a/OpenCL/m01760_a1.cl +++ b/OpenCL/m01760_a1.cl @@ -37,8 +37,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } 
sha512_hmac_ctx_t ctx0; diff --git a/OpenCL/m01760_a3.cl b/OpenCL/m01760_a3.cl index f3b80a484..4bb4c0d78 100644 --- a/OpenCL/m01760_a3.cl +++ b/OpenCL/m01760_a3.cl @@ -37,8 +37,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m01800.cl b/OpenCL/m01800.cl index 46270e3dc..02360d45e 100644 --- a/OpenCL/m01800.cl +++ b/OpenCL/m01800.cl @@ -36,8 +36,6 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < pw_lenv; idx++) @@ -54,8 +52,6 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < salt_lenv; idx++) diff --git a/OpenCL/m02501.cl b/OpenCL/m02501.cl index d506bc389..dbfd507ba 100644 --- a/OpenCL/m02501.cl +++ b/OpenCL/m02501.cl @@ -17,12 
+17,12 @@ #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" -inline u8 hex_convert (const u8 c) +u8 hex_convert (const u8 c) { return (c & 15) + (c >> 6) * 9; } -inline u8 hex_to_u8 (const u8 hex[2]) +u8 hex_to_u8 (const u8 hex[2]) { u8 v = 0; diff --git a/OpenCL/m02610_a0.cl b/OpenCL/m02610_a0.cl index 633ea6f13..d86d026c8 100644 --- a/OpenCL/m02610_a0.cl +++ b/OpenCL/m02610_a0.cl @@ -69,21 +69,17 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -197,21 +193,17 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m02610_a1.cl b/OpenCL/m02610_a1.cl index 3818eb960..0cfd702fa 100644 --- a/OpenCL/m02610_a1.cl +++ b/OpenCL/m02610_a1.cl @@ -62,13 +62,11 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -179,13 +177,11 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) 
salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m02610_a3.cl b/OpenCL/m02610_a3.cl index 442dfe070..441ec66fd 100644 --- a/OpenCL/m02610_a3.cl +++ b/OpenCL/m02610_a3.cl @@ -67,8 +67,6 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -201,8 +197,6 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -214,8 +208,6 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m02810_a0.cl b/OpenCL/m02810_a0.cl index 02dfa50fd..c1e047201 100644 --- a/OpenCL/m02810_a0.cl +++ b/OpenCL/m02810_a0.cl @@ -69,21 +69,17 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -220,21 +216,17 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m02810_a1.cl b/OpenCL/m02810_a1.cl index 28b4e271d..c14141814 100644 --- a/OpenCL/m02810_a1.cl +++ b/OpenCL/m02810_a1.cl @@ -62,13 +62,11 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -202,13 +200,11 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m02810_a3.cl b/OpenCL/m02810_a3.cl index 0a8eff53b..5cd8bbc94 100644 --- a/OpenCL/m02810_a3.cl +++ b/OpenCL/m02810_a3.cl @@ -25,7 +25,7 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -__kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32 *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -67,21 +67,17 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -170,7 +166,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32 *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, 
__global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -224,21 +220,17 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03000_a3.cl b/OpenCL/m03000_a3.cl index 9817c3828..26dea4196 100644 --- a/OpenCL/m03000_a3.cl +++ b/OpenCL/m03000_a3.cl @@ -19,7 +19,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL volatile +#define KXX_DECL #endif #ifdef IS_GENERIC @@ -898,11 +898,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). 
@@ -921,469 +921,556 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, 
x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, 
x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ 
a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = 
x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 
& ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 = x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - 
vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; 
+ x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, 
x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + 
x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ 
x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | 
x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = 
a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = 
x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, 
x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + 
x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ 
xC729695A; + *out1 ^= x01; } #endif @@ -1452,60 +1539,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 KXX_DECL u32 k36, k37, k38, k39, k40, k41; KXX_DECL u32 k42, k43, k44, k45, k46, k47; - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30); - s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17); - s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02); - s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18); - s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62); - s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49); - s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, 
D32); - s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34); - s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50); - s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1599,8 +1632,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - - #endif } void transpose32c (u32 data[32]) @@ -1694,7 +1725,7 @@ void transpose32c (u32 data[32]) swap (data[30], data[31], 1, 0x55555555); } -void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global 
const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * base @@ -2066,7 +2097,7 @@ void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __glo } } -void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset) +void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * base @@ -2481,7 +2512,7 @@ __kernel void m03000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) } } -__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const 
u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -2499,7 +2530,7 @@ __kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule m03000m (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, 
__global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m03710_a0.cl 
b/OpenCL/m03710_a0.cl index 054465f2d..1401ad716 100644 --- a/OpenCL/m03710_a0.cl +++ b/OpenCL/m03710_a0.cl @@ -69,21 +69,17 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -210,21 +206,17 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03710_a1.cl b/OpenCL/m03710_a1.cl index b74fa951f..18ed573ad 100644 --- a/OpenCL/m03710_a1.cl +++ b/OpenCL/m03710_a1.cl @@ -62,13 +62,11 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -192,13 +190,11 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03710_a3.cl b/OpenCL/m03710_a3.cl index 8c35c9263..e17da66f9 100644 --- 
a/OpenCL/m03710_a3.cl +++ b/OpenCL/m03710_a3.cl @@ -67,8 +67,6 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -214,8 +210,6 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -227,8 +221,6 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03800_a0.cl b/OpenCL/m03800_a0.cl index 42344114b..8608dc55c 100644 --- a/OpenCL/m03800_a0.cl +++ b/OpenCL/m03800_a0.cl @@ -39,8 +39,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -123,8 +119,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03800_a1.cl b/OpenCL/m03800_a1.cl index c594eee3d..31007631a 100644 --- a/OpenCL/m03800_a1.cl +++ b/OpenCL/m03800_a1.cl @@ -37,8 +37,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03800_a3.cl b/OpenCL/m03800_a3.cl index 00275dbeb..5736596bc 100644 --- a/OpenCL/m03800_a3.cl +++ b/OpenCL/m03800_a3.cl @@ -37,8 +37,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -129,8 +125,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -142,8 +136,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03910_a0.cl b/OpenCL/m03910_a0.cl index 
ab08e1a87..a296705c5 100644 --- a/OpenCL/m03910_a0.cl +++ b/OpenCL/m03910_a0.cl @@ -69,21 +69,17 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -220,21 +216,17 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03910_a1.cl b/OpenCL/m03910_a1.cl index c4186f01e..1a082b9df 100644 --- a/OpenCL/m03910_a1.cl +++ b/OpenCL/m03910_a1.cl @@ -62,13 +62,11 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -202,13 +200,11 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03910_a3.cl b/OpenCL/m03910_a3.cl index 48e7ae215..95ca4a3f3 100644 --- a/OpenCL/m03910_a3.cl +++ b/OpenCL/m03910_a3.cl @@ -67,8 +67,6 @@ __kernel void m03910_mxx 
(__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -80,8 +78,6 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -224,8 +220,6 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -237,8 +231,6 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04010_a0.cl b/OpenCL/m04010_a0.cl index 454997290..e1365e632 100644 --- a/OpenCL/m04010_a0.cl +++ b/OpenCL/m04010_a0.cl @@ -69,8 +69,6 @@ __kernel void m04010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -197,8 +195,6 @@ __kernel void m04010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04010_a3.cl b/OpenCL/m04010_a3.cl index 4822b707b..91b54e48f 100644 --- a/OpenCL/m04010_a3.cl +++ b/OpenCL/m04010_a3.cl @@ -67,8 +67,6 @@ __kernel void m04010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -205,8 +203,6 @@ __kernel void m04010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04110_a0.cl b/OpenCL/m04110_a0.cl index 4bcb4def6..ed85497a3 100644 --- a/OpenCL/m04110_a0.cl +++ b/OpenCL/m04110_a0.cl @@ -69,8 +69,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -82,8 +80,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -214,8 +210,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -227,8 +221,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04110_a1.cl b/OpenCL/m04110_a1.cl index 57684942a..5bce915db 100644 --- a/OpenCL/m04110_a1.cl +++ b/OpenCL/m04110_a1.cl @@ -67,8 +67,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -201,8 +199,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04110_a3.cl b/OpenCL/m04110_a3.cl index f466a3178..57ec737f4 100644 --- a/OpenCL/m04110_a3.cl +++ b/OpenCL/m04110_a3.cl @@ -67,8 +67,6 @@ 
__kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -220,8 +216,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -233,8 +227,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04310_a0.cl b/OpenCL/m04310_a0.cl index 8ff005fa0..6e2ac764e 100644 --- a/OpenCL/m04310_a0.cl +++ b/OpenCL/m04310_a0.cl @@ -69,21 +69,17 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -197,21 +193,17 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) 
{ s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04310_a1.cl b/OpenCL/m04310_a1.cl index aea272f93..3ed91424c 100644 --- a/OpenCL/m04310_a1.cl +++ b/OpenCL/m04310_a1.cl @@ -62,13 +62,11 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -179,13 +177,11 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04310_a3.cl b/OpenCL/m04310_a3.cl index 2f491a2fb..e265cf4a1 100644 --- a/OpenCL/m04310_a3.cl +++ b/OpenCL/m04310_a3.cl @@ -67,8 +67,6 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -201,8 +197,6 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -214,8 +208,6 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier 
(CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04400_a0.cl b/OpenCL/m04400_a0.cl new file mode 100644 index 000000000..7d583393c --- /dev/null +++ b/OpenCL/m04400_a0.cl @@ -0,0 +1,249 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global 
const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | 
uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + 
const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d 
>> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04400_a1.cl b/OpenCL/m04400_a1.cl new file mode 100644 index 000000000..190138933 --- /dev/null +++ b/OpenCL/m04400_a1.cl @@ -0,0 +1,229 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04400_mxx (__global pw_t *pws, __global 
const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, 
__global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + 
const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04400_a3.cl b/OpenCL/m04400_a3.cl new file mode 100644 index 000000000..8439a874d --- /dev/null +++ b/OpenCL/m04400_a3.cl @@ -0,0 +1,259 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, 
__global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 
+ | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 
salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = 
uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04500_a0.cl b/OpenCL/m04500_a0.cl new file mode 100644 index 000000000..35d111c0e --- /dev/null +++ b/OpenCL/m04500_a0.cl @@ -0,0 +1,248 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 
+#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + 
COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e 
>> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04500_a1.cl b/OpenCL/m04500_a1.cl new file mode 100644 index 000000000..272495427 --- /dev/null +++ b/OpenCL/m04500_a1.cl @@ -0,0 +1,228 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 
*bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04500_sxx (__global pw_t *pws, __global 
const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + 
+ sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04500_a3.cl b/OpenCL/m04500_a3.cl new file mode 100644 index 000000000..cea19b72f --- /dev/null +++ b/OpenCL/m04500_a3.cl @@ -0,0 +1,258 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final_vector (&ctx); + 
+ const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + 
ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04520_a0.cl b/OpenCL/m04520_a0.cl new file mode 100644 index 000000000..bf8924407 --- /dev/null +++ b/OpenCL/m04520_a0.cl @@ -0,0 +1,278 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04520_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global 
const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + 
w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + 
w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04520_a1.cl b/OpenCL/m04520_a1.cl new file mode 100644 index 000000000..797380832 --- /dev/null +++ b/OpenCL/m04520_a1.cl @@ -0,0 +1,258 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04520_mxx (__global pw_t *pws, __global const kernel_rule_t 
*rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + 
sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e 
// (tail of the preceding scalar kernel, cut at the chunk boundary: finish
//  hex-encoding SHA-1 word e, append the 40-byte hex digest to the salted
//  outer SHA-1 context, and compare against the single target digest)
>> 24) & 255) << 16;
    w2[1] = uint_to_hex_lower8_le ((e >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((e >>  8) & 255) << 16;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    // feed the 40 ASCII hex characters of the inner digest to the outer hash
    sha1_update_64 (&ctx, w0, w1, w2, w3, 40);

    sha1_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}
diff --git a/OpenCL/m04520_a3.cl b/OpenCL/m04520_a3.cl
new file mode 100644
index 000000000..ff9a71e82
--- /dev/null
+++ b/OpenCL/m04520_a3.cl
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

#define NEW_SIMD_CODE

#include "inc_vendor.cl"
#include "inc_hash_constants.h"
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_sha1.cl"

// uint_to_hex_lower8_le(i): look up two lowercase-hex ASCII characters for
// byte i (0..255) in the __local l_bin2asc table; one variant per SIMD width
// so each vector lane is translated independently.
#if   VECT_SIZE == 1
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
#elif VECT_SIZE == 2
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
#elif VECT_SIZE == 4
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
#elif VECT_SIZE == 8
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
#elif VECT_SIZE == 16
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif

// Multi-hash (-m) brute-force kernel: per candidate it computes
// sha1 (salt . lowercase_hex (sha1 (password))), vectorized over VECT_SIZE
// mask candidates at a time.
__kernel void m04520_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table: cooperatively fill the 256-entry byte -> two-hex-chars
   * lookup table in local memory; all work-items sync on it below.
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * base: load the password words (rounded up to whole u32s)
   */

  const u32 pw_len = pws[gid].pw_len;

  const u32 pw_lenv = ceil ((float) pw_len / 4);

  u32x w[64] = { 0 };

  for (int idx = 0; idx < pw_lenv; idx++)
  {
    w[idx] = pws[gid].i[idx];
  }

  // outer SHA-1 pre-seeded with the (byte-swapped) salt; copied per candidate
  sha1_ctx_t ctx0;

  sha1_init (&ctx0);

  sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len);

  /**
   * loop: patch the mask-generated word into w[0] and hash
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0lr = w0l | w0r;

    w[0] = w0lr;

    // inner hash: sha1 (password)
    sha1_ctx_vector_t ctx1;

    sha1_init_vector (&ctx1);

    sha1_update_vector (&ctx1, w, pw_len);

    sha1_final_vector (&ctx1);

    const u32x a = ctx1.h[0];
    const u32x b = ctx1.h[1];
    const u32x c = ctx1.h[2];
    const u32x d = ctx1.h[3];
    const u32x e = ctx1.h[4];

    // outer hash: broadcast the scalar salt context into vector lanes
    sha1_ctx_vector_t ctx;

    sha1_init_vector_from_scalar (&ctx, &ctx0);

    u32x w0[4];
    u32x w1[4];
    u32x w2[4];
    u32x w3[4];

    // encode the 20-byte inner digest as 40 lowercase hex characters
    // (byte order follows SHA-1's big-endian state words)
    w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((a >> 24) & 255) << 16;
    w0[1] = uint_to_hex_lower8_le ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((a >>  8) & 255) << 16;
    w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((b >> 24) & 255) << 16;
    w0[3] = uint_to_hex_lower8_le ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((b >>  8) & 255) << 16;
    w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((c >> 24) & 255) << 16;
    w1[1] = uint_to_hex_lower8_le ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((c >>  8) & 255) << 16;
    w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((d >> 24) & 255) << 16;
    w1[3] = uint_to_hex_lower8_le ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((d >>  8) & 255) << 16;
    w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((e >> 24) & 255) << 16;
    w2[1] = uint_to_hex_lower8_le ((e >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((e >>  8) & 255) << 16;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40);

    sha1_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}

// Single-hash (-s) variant of the kernel above: identical pipeline, but
// compares each candidate against one preloaded target digest.
__kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * digest: the single target to search for
   */

  const u32 search[4] =
  {
    digests_buf[digests_offset].digest_buf[DGST_R0],
    digests_buf[digests_offset].digest_buf[DGST_R1],
    digests_buf[digests_offset].digest_buf[DGST_R2],
    digests_buf[digests_offset].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  const u32 pw_len = pws[gid].pw_len;

  const u32 pw_lenv = ceil ((float) pw_len / 4);

  u32x w[64] = { 0 };

  for (int idx = 0; idx < pw_lenv; idx++)
  {
    w[idx] = pws[gid].i[idx];
  }

  sha1_ctx_t ctx0;

  sha1_init (&ctx0);

  sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len);

  /**
   * loop
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0lr = w0l | w0r;

    w[0] = w0lr;

    sha1_ctx_vector_t ctx1;

    sha1_init_vector (&ctx1);

    sha1_update_vector (&ctx1, w, pw_len);

    sha1_final_vector (&ctx1);

    const u32x a = ctx1.h[0];
    const u32x b = ctx1.h[1];
    const u32x c = ctx1.h[2];
    const u32x d = ctx1.h[3];
    const u32x e = ctx1.h[4];

    sha1_ctx_vector_t ctx;

    sha1_init_vector_from_scalar (&ctx, &ctx0);

    u32x w0[4];
    u32x w1[4];
    u32x w2[4];
    u32x w3[4];

    w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((a >> 24) & 255) << 16;
    w0[1] = uint_to_hex_lower8_le ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((a >>  8) & 255) << 16;
    w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((b >> 24) & 255) << 16;
    w0[3] = uint_to_hex_lower8_le ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((b >>  8) & 255) << 16;
    w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((c >> 24) & 255) << 16;
    w1[1] = uint_to_hex_lower8_le ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((c >>  8) & 255) << 16;
    w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((d >> 24) & 255) << 16;
    w1[3] = uint_to_hex_lower8_le ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((d >>  8) & 255) << 16;
    w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) <<  0
          | uint_to_hex_lower8_le ((e >> 24) & 255) << 16;
    w2[1] = uint_to_hex_lower8_le ((e >>  0) & 255) <<  0
          | uint_to_hex_lower8_le ((e >>  8) & 255) << 16;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40);

    sha1_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}
diff --git a/OpenCL/m04700_a0.cl b/OpenCL/m04700_a0.cl
new file mode 100644
index 000000000..611834f5d
--- /dev/null
+++ b/OpenCL/m04700_a0.cl
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

//#define NEW_SIMD_CODE

#include "inc_vendor.cl"
#include "inc_hash_constants.h"
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_rp.h"
#include "inc_rp.cl"
#include "inc_scalar.cl"
#include "inc_hash_md5.cl"
#include "inc_hash_sha1.cl"

#if   VECT_SIZE == 1
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
#elif VECT_SIZE == 2
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
#elif VECT_SIZE == 4
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
#elif VECT_SIZE == 8
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
#elif VECT_SIZE == 16
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4],
l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif

// Multi-hash (-m) rule-attack kernel: per candidate it computes
// sha1 (lowercase_hex (md5 (password))) -- one candidate per iteration
// (scalar path; the rules engine is still a TODO below).
//
// FIX: the original read `const u32 e = ctx0.h[4];` after md5_final.
// md5_ctx_t carries only four state words (h[0..3], MD5's A/B/C/D), so
// h[4] is an out-of-bounds read -- and `e` was never used. Removed in
// both kernels of this file.
__kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex chars, built cooperatively
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * base: load the password words (rounded up to whole u32s)
   */

  const u32 pw_len = pws[gid].pw_len;

  const u32 pw_lenv = ceil ((float) pw_len / 4);

  u32 w[64] = { 0 };

  for (int idx = 0; idx < pw_lenv; idx++)
  {
    w[idx] = pws[gid].i[idx];
  }

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    // todo: add rules engine

    // inner hash: md5 (password)
    md5_ctx_t ctx0;

    md5_init (&ctx0);

    md5_update (&ctx0, w, pw_len);

    md5_final (&ctx0);

    const u32 a = ctx0.h[0];
    const u32 b = ctx0.h[1];
    const u32 c = ctx0.h[2];
    const u32 d = ctx0.h[3];

    // outer hash: sha1 over the 32-char lowercase hex of the MD5 digest,
    // written straight into the first SHA-1 message block
    sha1_ctx_t ctx;

    sha1_init (&ctx);

    ctx.w0[0] = uint_to_hex_lower8_le ((a >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((a >>  0) & 255) << 16;
    ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
    ctx.w0[2] = uint_to_hex_lower8_le ((b >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((b >>  0) & 255) << 16;
    ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
    ctx.w1[0] = uint_to_hex_lower8_le ((c >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((c >>  0) & 255) << 16;
    ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
    ctx.w1[2] = uint_to_hex_lower8_le ((d >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((d >>  0) & 255) << 16;
    ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;

    ctx.len = 32; // 32 hex characters already placed in w0/w1

    sha1_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}

// Single-hash (-s) variant: identical pipeline, compares against one
// preloaded target digest. Same h[4] OOB read removed here as well.
__kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * digest: the single target to search for
   */

  const u32 search[4] =
  {
    digests_buf[digests_offset].digest_buf[DGST_R0],
    digests_buf[digests_offset].digest_buf[DGST_R1],
    digests_buf[digests_offset].digest_buf[DGST_R2],
    digests_buf[digests_offset].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  const u32 pw_len = pws[gid].pw_len;

  const u32 pw_lenv = ceil ((float) pw_len / 4);

  u32 w[64] = { 0 };

  for (int idx = 0; idx < pw_lenv; idx++)
  {
    w[idx] = pws[gid].i[idx];
  }

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    // todo: add rules engine

    md5_ctx_t ctx0;

    md5_init (&ctx0);

    md5_update (&ctx0, w, pw_len);

    md5_final (&ctx0);

    const u32 a = ctx0.h[0];
    const u32 b = ctx0.h[1];
    const u32 c = ctx0.h[2];
    const u32 d = ctx0.h[3];

    sha1_ctx_t ctx;

    sha1_init (&ctx);

    ctx.w0[0] = uint_to_hex_lower8_le ((a >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((a >>  0) & 255) << 16;
    ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
    ctx.w0[2] = uint_to_hex_lower8_le ((b >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((b >>  0) & 255) << 16;
    ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
    ctx.w1[0] = uint_to_hex_lower8_le ((c >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((c >>  0) & 255) << 16;
    ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
    ctx.w1[2] = uint_to_hex_lower8_le ((d >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((d >>  0) & 255) << 16;
    ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;

    ctx.len = 32;

    sha1_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}
diff --git a/OpenCL/m04700_a1.cl b/OpenCL/m04700_a1.cl
new file mode 100644
index 000000000..9a1ab14a9
--- /dev/null
+++ b/OpenCL/m04700_a1.cl
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

//#define NEW_SIMD_CODE

#include "inc_vendor.cl"
#include "inc_hash_constants.h"
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_scalar.cl"
#include "inc_hash_md5.cl"
#include "inc_hash_sha1.cl"

#if   VECT_SIZE == 1
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
#elif VECT_SIZE == 2
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
#elif VECT_SIZE == 4
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
#elif VECT_SIZE == 8
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
#elif VECT_SIZE == 16
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif

__kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf,
__global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  // Multi-hash (-m) combinator kernel: sha1 (lowercase_hex (md5 (left . right))).
  //
  // FIX: the original read `const u32 e = ctx1.h[4];` after md5_final.
  // md5_ctx_t carries only four state words (h[0..3], MD5's A/B/C/D), so
  // h[4] is an out-of-bounds read -- and `e` was never used. Removed in
  // both kernels of this file.

  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex chars, built cooperatively
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * base: MD5 pre-seeded with the left word; right word appended per iteration
   */

  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    md5_ctx_t ctx1 = ctx0;

    md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md5_final (&ctx1);

    const u32 a = ctx1.h[0];
    const u32 b = ctx1.h[1];
    const u32 c = ctx1.h[2];
    const u32 d = ctx1.h[3];

    // outer hash: sha1 over the 32-char lowercase hex of the MD5 digest
    sha1_ctx_t ctx;

    sha1_init (&ctx);

    ctx.w0[0] = uint_to_hex_lower8_le ((a >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((a >>  0) & 255) << 16;
    ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
    ctx.w0[2] = uint_to_hex_lower8_le ((b >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((b >>  0) & 255) << 16;
    ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
    ctx.w1[0] = uint_to_hex_lower8_le ((c >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((c >>  0) & 255) << 16;
    ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
    ctx.w1[2] = uint_to_hex_lower8_le ((d >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((d >>  0) & 255) << 16;
    ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;

    ctx.len = 32; // 32 hex characters already placed in w0/w1

    sha1_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}

// Single-hash (-s) variant: identical pipeline, compares against one
// preloaded target digest. Same h[4] OOB read removed here as well.
__kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * digest: the single target to search for
   */

  const u32 search[4] =
  {
    digests_buf[digests_offset].digest_buf[DGST_R0],
    digests_buf[digests_offset].digest_buf[DGST_R1],
    digests_buf[digests_offset].digest_buf[DGST_R2],
    digests_buf[digests_offset].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    md5_ctx_t ctx1 = ctx0;

    md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md5_final (&ctx1);

    const u32 a = ctx1.h[0];
    const u32 b = ctx1.h[1];
    const u32 c = ctx1.h[2];
    const u32 d = ctx1.h[3];

    sha1_ctx_t ctx;

    sha1_init (&ctx);

    ctx.w0[0] = uint_to_hex_lower8_le ((a >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((a >>  0) & 255) << 16;
    ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
    ctx.w0[2] = uint_to_hex_lower8_le ((b >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((b >>  0) & 255) << 16;
    ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
    ctx.w1[0] = uint_to_hex_lower8_le ((c >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((c >>  0) & 255) << 16;
    ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
    ctx.w1[2] = uint_to_hex_lower8_le ((d >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((d >>  0) & 255) << 16;
    ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;

    ctx.len = 32;

    sha1_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}
diff --git a/OpenCL/m04700_a3.cl b/OpenCL/m04700_a3.cl
new file mode 100644
index 000000000..9211ec451
--- /dev/null
+++ b/OpenCL/m04700_a3.cl
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

#define NEW_SIMD_CODE

#include "inc_vendor.cl"
#include "inc_hash_constants.h"
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_md5.cl"
#include "inc_hash_sha1.cl"

#if   VECT_SIZE == 1
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)])
#elif VECT_SIZE == 2
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
#elif VECT_SIZE == 4
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
#elif VECT_SIZE == 8
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
#elif VECT_SIZE == 16
#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif

__kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const
digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  // Multi-hash (-m) brute-force kernel (SIMD): per candidate it computes
  // sha1 (lowercase_hex (md5 (password))), VECT_SIZE candidates per
  // iteration. Note this vector variant correctly reads only the four
  // MD5 state words h[0..3].

  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex chars, built cooperatively
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * base: load the password words (rounded up to whole u32s)
   */

  const u32 pw_len = pws[gid].pw_len;

  const u32 pw_lenv = ceil ((float) pw_len / 4);

  u32x w[64] = { 0 };

  for (int idx = 0; idx < pw_lenv; idx++)
  {
    w[idx] = pws[gid].i[idx];
  }

  /**
   * loop: patch the mask-generated word into w[0] and hash
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0 = w0l | w0r;

    w[0] = w0;

    // inner hash: md5 (password)
    md5_ctx_vector_t ctx0;

    md5_init_vector (&ctx0);

    md5_update_vector (&ctx0, w, pw_len);

    md5_final_vector (&ctx0);

    const u32x a = ctx0.h[0];
    const u32x b = ctx0.h[1];
    const u32x c = ctx0.h[2];
    const u32x d = ctx0.h[3];

    // outer hash: sha1 over the 32-char lowercase hex of the MD5 digest,
    // written straight into the first SHA-1 message block
    sha1_ctx_vector_t ctx;

    sha1_init_vector (&ctx);

    ctx.w0[0] = uint_to_hex_lower8_le ((a >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((a >>  0) & 255) << 16;
    ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
    ctx.w0[2] = uint_to_hex_lower8_le ((b >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((b >>  0) & 255) << 16;
    ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
    ctx.w1[0] = uint_to_hex_lower8_le ((c >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((c >>  0) & 255) << 16;
    ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
    ctx.w1[2] = uint_to_hex_lower8_le ((d >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((d >>  0) & 255) << 16;
    ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;

    ctx.len = 32; // 32 hex characters already placed in w0/w1

    sha1_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}

// Single-hash (-s) variant of the kernel above: identical pipeline, but
// compares each candidate against one preloaded target digest.
__kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * bin2asc table
   */

  __local u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8;
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  if (gid >= gid_max) return;

  /**
   * digest: the single target to search for
   */

  const u32 search[4] =
  {
    digests_buf[digests_offset].digest_buf[DGST_R0],
    digests_buf[digests_offset].digest_buf[DGST_R1],
    digests_buf[digests_offset].digest_buf[DGST_R2],
    digests_buf[digests_offset].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  const u32 pw_len = pws[gid].pw_len;

  const u32 pw_lenv = ceil ((float) pw_len / 4);

  u32x w[64] = { 0 };

  for (int idx = 0; idx < pw_lenv; idx++)
  {
    w[idx] = pws[gid].i[idx];
  }

  /**
   * loop
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0 = w0l | w0r;

    w[0] = w0;

    md5_ctx_vector_t ctx0;

    md5_init_vector (&ctx0);

    md5_update_vector (&ctx0, w, pw_len);

    md5_final_vector (&ctx0);

    const u32x a = ctx0.h[0];
    const u32x b = ctx0.h[1];
    const u32x c = ctx0.h[2];
    const u32x d = ctx0.h[3];

    sha1_ctx_vector_t ctx;

    sha1_init_vector (&ctx);

    ctx.w0[0] = uint_to_hex_lower8_le ((a >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((a >>  0) & 255) << 16;
    ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
    ctx.w0[2] = uint_to_hex_lower8_le ((b >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((b >>  0) & 255) << 16;
    ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
    ctx.w1[0] = uint_to_hex_lower8_le ((c >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((c >>  0) & 255) << 16;
    ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
    ctx.w1[2] = uint_to_hex_lower8_le ((d >>  8) & 255) <<  0
              | uint_to_hex_lower8_le ((d >>  0) & 255) << 16;
    ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) <<  0
              | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;

    ctx.len = 32;

    sha1_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}
diff --git a/OpenCL/m04800_a0.cl b/OpenCL/m04800_a0.cl
new file mode 100644
index 000000000..fea15ad37
--- /dev/null
+++ b/OpenCL/m04800_a0.cl
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

//#define NEW_SIMD_CODE

#include "inc_vendor.cl"
#include "inc_hash_constants.h"
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_rp.h"
#include "inc_rp.cl"
#include "inc_scalar.cl"
#include "inc_hash_md5.cl"

__kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32
digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_update (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 
bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_update (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04800_a1.cl b/OpenCL/m04800_a1.cl new file mode 100644 index 000000000..b213e74c2 --- /dev/null +++ b/OpenCL/m04800_a1.cl @@ -0,0 +1,140 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include 
"inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_update (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = 
ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + md5_update_global (&ctx0, 
pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_update (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04800_a3.cl b/OpenCL/m04800_a3.cl new file mode 100644 index 000000000..ed25c642c --- /dev/null +++ b/OpenCL/m04800_a3.cl @@ -0,0 +1,178 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 
lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32x s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32x s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04900_a0.cl b/OpenCL/m04900_a0.cl new file mode 100644 index 000000000..b1332a3de --- /dev/null +++ 
b/OpenCL/m04900_a0.cl @@ -0,0 +1,164 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S 
(salt_bufs[salt_pos].salt_buf[idx]); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base 
+ */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04900_a1.cl b/OpenCL/m04900_a1.cl new file mode 100644 index 000000000..1a05735e3 --- /dev/null +++ b/OpenCL/m04900_a1.cl @@ -0,0 +1,140 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, 
__global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, 
__global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04900_a3.cl b/OpenCL/m04900_a3.cl new file mode 100644 index 000000000..3f9668d61 --- /dev/null +++ b/OpenCL/m04900_a3.cl @@ -0,0 +1,178 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include 
"inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + 
*/ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + 
digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05100_a0.cl b/OpenCL/m05100_a0.cl new file mode 100644 index 000000000..bbaf71b68 --- /dev/null +++ b/OpenCL/m05100_a0.cl @@ -0,0 +1,138 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_M_SCALAR (r0, r1, z, z); + COMPARE_M_SCALAR (r1, r2, z, z); + COMPARE_M_SCALAR (r2, r3, z, z); + } +} + +__kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, 
__global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 0, + 0 + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_S_SCALAR (r0, r1, z, z); + COMPARE_S_SCALAR (r1, r2, z, z); + COMPARE_S_SCALAR (r2, r3, z, z); + } +} diff --git a/OpenCL/m05100_a1.cl b/OpenCL/m05100_a1.cl new file mode 100644 index 000000000..ead0e7190 --- /dev/null +++ b/OpenCL/m05100_a1.cl @@ -0,0 +1,119 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include 
"inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_M_SCALAR (r0, r1, z, z); + COMPARE_M_SCALAR (r1, r2, z, z); + COMPARE_M_SCALAR (r2, r3, z, z); + } +} + + +__kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, 
__global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 0, + 0 + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_S_SCALAR (r0, r1, z, z); + COMPARE_S_SCALAR (r1, r2, z, z); + COMPARE_S_SCALAR (r2, r3, z, z); + } +} diff --git a/OpenCL/m05100_a3.cl b/OpenCL/m05100_a3.cl new file mode 100644 index 000000000..3fcb567c9 --- /dev/null +++ b/OpenCL/m05100_a3.cl @@ 
-0,0 +1,148 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + 
md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + u32x z = 0; + + COMPARE_M_SIMD (r0, r1, z, z); + COMPARE_M_SIMD (r1, r2, z, z); + COMPARE_M_SIMD (r2, r3, z, z); + } +} + +__kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 0, + 0 + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos 
+= VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + u32x z = 0; + + COMPARE_S_SIMD (r0, r1, z, z); + COMPARE_S_SIMD (r1, r2, z, z); + COMPARE_S_SIMD (r2, r3, z, z); + } +} diff --git a/OpenCL/m05300_a0.cl b/OpenCL/m05300_a0.cl new file mode 100644 index 000000000..552785041 --- /dev/null +++ b/OpenCL/m05300_a0.cl @@ -0,0 +1,190 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const 
u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t 
*digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = 
ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05300_a1.cl b/OpenCL/m05300_a1.cl new file mode 100644 index 000000000..e96b61ba2 --- /dev/null +++ b/OpenCL/m05300_a1.cl @@ -0,0 +1,228 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = 
pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = combs_buf[il_pos].i[idx]; + } + + switch_buffer_by_offset_1x64_le_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, c, pw_len + comb_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = combs_buf[il_pos].i[idx]; + } + + switch_buffer_by_offset_1x64_le_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, c, pw_len + comb_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + 
+ md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05300_a3-optimized.cl b/OpenCL/m05300_a3-optimized.cl index 52e48240d..9ea662939 100644 --- a/OpenCL/m05300_a3-optimized.cl +++ b/OpenCL/m05300_a3-optimized.cl @@ -195,7 +195,7 @@ void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], md5_transform (w0, w1, w2, w3, digest); } -void m05300m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, 
__global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -254,20 +254,20 @@ void m05300m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; - w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 0]; + w0_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 1]; + w0_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 2]; + w0_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 3]; + w1_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 4]; + w1_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 5]; + w1_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 6]; + w1_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 7]; + w2_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 8]; + w2_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 9]; + w2_t[2] = ikepsk_bufs[digests_offset].nr_buf[10]; + w2_t[3] = ikepsk_bufs[digests_offset].nr_buf[11]; 
+ w3_t[0] = ikepsk_bufs[digests_offset].nr_buf[12]; + w3_t[1] = ikepsk_bufs[digests_offset].nr_buf[13]; w3_t[2] = (64 + nr_len) * 8; w3_t[3] = 0; @@ -342,7 +342,7 @@ void m05300m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons } } -void m05300s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, 
__global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -413,20 +413,20 @@ void m05300s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; - w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 0]; + w0_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 1]; + w0_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 2]; + w0_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 3]; + w1_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 4]; + w1_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 5]; + w1_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 6]; + w1_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 7]; + w2_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 8]; + w2_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 9]; + w2_t[2] = ikepsk_bufs[digests_offset].nr_buf[10]; + w2_t[3] = ikepsk_bufs[digests_offset].nr_buf[11]; + w3_t[0] = ikepsk_bufs[digests_offset].nr_buf[12]; + w3_t[1] = ikepsk_bufs[digests_offset].nr_buf[13]; w3_t[2] = (64 + nr_len) * 8; w3_t[3] = 0; @@ -515,15 +515,6 @@ __kernel void m05300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; 
i += lsz) @@ -573,7 +564,7 @@ __kernel void m05300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 
salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -590,15 +581,6 @@ __kernel void m05300_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -648,7 +630,7 @@ __kernel void m05300_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -665,15 +647,6 @@ __kernel void m05300_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -723,7 +696,7 @@ __kernel void m05300_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, 
bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -740,15 +713,6 @@ __kernel void m05300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -798,7 +762,7 @@ __kernel void m05300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, 
bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -815,15 +779,6 @@ __kernel void m05300_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -873,7 +828,7 @@ __kernel 
void m05300_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 
loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -890,15 +845,6 @@ __kernel void m05300_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -948,5 +894,5 @@ __kernel void m05300_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } diff --git a/OpenCL/m05300_a3.cl b/OpenCL/m05300_a3.cl new file mode 100644 index 000000000..80891631c --- /dev/null +++ b/OpenCL/m05300_a3.cl @@ -0,0 +1,200 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include 
"inc_hash_md5.cl" + +__kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + 
w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + 
digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05400_a0.cl b/OpenCL/m05400_a0.cl new file mode 100644 index 000000000..252e18f61 --- /dev/null +++ b/OpenCL/m05400_a0.cl @@ -0,0 +1,190 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global 
const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init_swap (&ctx0, w, pw_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + 
sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx 
< pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init_swap (&ctx0, w, pw_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05400_a1.cl b/OpenCL/m05400_a1.cl new file mode 100644 index 000000000..8aa75bef2 --- /dev/null +++ b/OpenCL/m05400_a1.cl @@ -0,0 +1,228 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, 
__global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, c, pw_len + comb_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + 
+ sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; 
idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, c, pw_len + comb_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05400_a3-optimized.cl b/OpenCL/m05400_a3-optimized.cl index 4ad39067c..80097b8f0 100644 --- a/OpenCL/m05400_a3-optimized.cl +++ b/OpenCL/m05400_a3-optimized.cl @@ -229,7 +229,7 @@ void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5] sha1_transform (w0, w1, w2, w3, digest); } -void m05400m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t 
*bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -288,20 
+288,20 @@ void m05400m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; - w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 0]); + w0_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 1]); + w0_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 2]); + w0_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 3]); + w1_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 4]); + w1_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 5]); + w1_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 6]); + w1_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 7]); + w2_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 8]); + w2_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 9]); + w2_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[10]); + w2_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[11]); + w3_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[12]); + w3_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[13]); w3_t[2] = 0; w3_t[3] = (64 + nr_len) * 8; @@ -376,7 +376,7 @@ void m05400m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons } } -void m05400s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, 
__global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -447,20 +447,20 @@ void m05400s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; - w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - 
w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 0]); + w0_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 1]); + w0_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 2]); + w0_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 3]); + w1_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 4]); + w1_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 5]); + w1_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 6]); + w1_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 7]); + w2_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 8]); + w2_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 9]); + w2_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[10]); + w2_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[11]); + w3_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[12]); + w3_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[13]); w3_t[2] = 0; w3_t[3] = (64 + nr_len) * 8; @@ -549,15 +549,6 @@ __kernel void m05400_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -607,7 +598,7 @@ __kernel void m05400_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, 
s_msg_buf); + m05400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -624,15 +615,6 @@ __kernel void m05400_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -682,7 +664,7 @@ __kernel void m05400_m08 (__global pw_t *pws, __global const kernel_rule_t *rule 
* main */ - m05400m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 
digests_offset, const u32 combs_mode, const u32 gid_max) @@ -699,15 +681,6 @@ __kernel void m05400_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -757,7 +730,7 @@ __kernel void m05400_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, 
__global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -774,15 +747,6 @@ __kernel void m05400_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -832,7 +796,7 @@ __kernel void m05400_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } 
__kernel void m05400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -849,15 +813,6 @@ __kernel void m05400_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -907,7 +862,7 @@ __kernel void m05400_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, 
s_msg_buf); + m05400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -924,15 +879,6 @@ __kernel void m05400_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -982,5 +928,5 @@ __kernel void m05400_s16 (__global pw_t *pws, __global const kernel_rule_t *rule 
* main */ - m05400s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } diff --git a/OpenCL/m05400_a3.cl b/OpenCL/m05400_a3.cl new file mode 100644 index 000000000..e40fd0cbd --- /dev/null +++ b/OpenCL/m05400_a3.cl @@ -0,0 +1,200 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, 
__global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, w, pw_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32x r0 = ctx.opad.h[DGST_R0]; + const u32x r1 = ctx.opad.h[DGST_R1]; + const u32x r2 = ctx.opad.h[DGST_R2]; + const u32x r3 = ctx.opad.h[DGST_R3]; 
+ + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, w, pw_len); + 
+ sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32x r0 = ctx.opad.h[DGST_R0]; + const u32x r1 = ctx.opad.h[DGST_R1]; + const u32x r2 = ctx.opad.h[DGST_R2]; + const u32x r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05500_a0.cl b/OpenCL/m05500_a0.cl new file mode 100644 index 000000000..d10338f6c --- /dev/null +++ b/OpenCL/m05500_a0.cl @@ -0,0 +1,767 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" + + +#define PERM_OP(a,b,tt,n,m) \ +{ \ + tt = a >> n; \ + tt = tt ^ b; \ + tt = tt & m; \ + b = b ^ tt; \ + tt = tt << n; \ + a = a ^ tt; \ +} + +#define HPERM_OP(a,tt,n,m) \ +{ \ + tt = a << (16 + n); \ + tt = tt ^ a; \ + tt = tt & m; \ + a = a ^ tt; \ + tt = tt >> (16 + n); \ + a = a ^ tt; \ +} + +__constant u32a c_SPtrans[8][64] = +{ + { + 0x02080800, 0x00080000, 0x02000002, 0x02080802, + 0x02000000, 0x00080802, 0x00080002, 0x02000002, + 0x00080802, 0x02080800, 0x02080000, 0x00000802, + 0x02000802, 0x02000000, 0x00000000, 0x00080002, + 0x00080000, 0x00000002, 0x02000800, 0x00080800, + 0x02080802, 
0x02080000, 0x00000802, 0x02000800, + 0x00000002, 0x00000800, 0x00080800, 0x02080002, + 0x00000800, 0x02000802, 0x02080002, 0x00000000, + 0x00000000, 0x02080802, 0x02000800, 0x00080002, + 0x02080800, 0x00080000, 0x00000802, 0x02000800, + 0x02080002, 0x00000800, 0x00080800, 0x02000002, + 0x00080802, 0x00000002, 0x02000002, 0x02080000, + 0x02080802, 0x00080800, 0x02080000, 0x02000802, + 0x02000000, 0x00000802, 0x00080002, 0x00000000, + 0x00080000, 0x02000000, 0x02000802, 0x02080800, + 0x00000002, 0x02080002, 0x00000800, 0x00080802, + }, + { + 0x40108010, 0x00000000, 0x00108000, 0x40100000, + 0x40000010, 0x00008010, 0x40008000, 0x00108000, + 0x00008000, 0x40100010, 0x00000010, 0x40008000, + 0x00100010, 0x40108000, 0x40100000, 0x00000010, + 0x00100000, 0x40008010, 0x40100010, 0x00008000, + 0x00108010, 0x40000000, 0x00000000, 0x00100010, + 0x40008010, 0x00108010, 0x40108000, 0x40000010, + 0x40000000, 0x00100000, 0x00008010, 0x40108010, + 0x00100010, 0x40108000, 0x40008000, 0x00108010, + 0x40108010, 0x00100010, 0x40000010, 0x00000000, + 0x40000000, 0x00008010, 0x00100000, 0x40100010, + 0x00008000, 0x40000000, 0x00108010, 0x40008010, + 0x40108000, 0x00008000, 0x00000000, 0x40000010, + 0x00000010, 0x40108010, 0x00108000, 0x40100000, + 0x40100010, 0x00100000, 0x00008010, 0x40008000, + 0x40008010, 0x00000010, 0x40100000, 0x00108000, + }, + { + 0x04000001, 0x04040100, 0x00000100, 0x04000101, + 0x00040001, 0x04000000, 0x04000101, 0x00040100, + 0x04000100, 0x00040000, 0x04040000, 0x00000001, + 0x04040101, 0x00000101, 0x00000001, 0x04040001, + 0x00000000, 0x00040001, 0x04040100, 0x00000100, + 0x00000101, 0x04040101, 0x00040000, 0x04000001, + 0x04040001, 0x04000100, 0x00040101, 0x04040000, + 0x00040100, 0x00000000, 0x04000000, 0x00040101, + 0x04040100, 0x00000100, 0x00000001, 0x00040000, + 0x00000101, 0x00040001, 0x04040000, 0x04000101, + 0x00000000, 0x04040100, 0x00040100, 0x04040001, + 0x00040001, 0x04000000, 0x04040101, 0x00000001, + 0x00040101, 0x04000001, 0x04000000, 
0x04040101, + 0x00040000, 0x04000100, 0x04000101, 0x00040100, + 0x04000100, 0x00000000, 0x04040001, 0x00000101, + 0x04000001, 0x00040101, 0x00000100, 0x04040000, + }, + { + 0x00401008, 0x10001000, 0x00000008, 0x10401008, + 0x00000000, 0x10400000, 0x10001008, 0x00400008, + 0x10401000, 0x10000008, 0x10000000, 0x00001008, + 0x10000008, 0x00401008, 0x00400000, 0x10000000, + 0x10400008, 0x00401000, 0x00001000, 0x00000008, + 0x00401000, 0x10001008, 0x10400000, 0x00001000, + 0x00001008, 0x00000000, 0x00400008, 0x10401000, + 0x10001000, 0x10400008, 0x10401008, 0x00400000, + 0x10400008, 0x00001008, 0x00400000, 0x10000008, + 0x00401000, 0x10001000, 0x00000008, 0x10400000, + 0x10001008, 0x00000000, 0x00001000, 0x00400008, + 0x00000000, 0x10400008, 0x10401000, 0x00001000, + 0x10000000, 0x10401008, 0x00401008, 0x00400000, + 0x10401008, 0x00000008, 0x10001000, 0x00401008, + 0x00400008, 0x00401000, 0x10400000, 0x10001008, + 0x00001008, 0x10000000, 0x10000008, 0x10401000, + }, + { + 0x08000000, 0x00010000, 0x00000400, 0x08010420, + 0x08010020, 0x08000400, 0x00010420, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x00010400, + 0x08000420, 0x08010020, 0x08010400, 0x00000000, + 0x00010400, 0x08000000, 0x00010020, 0x00000420, + 0x08000400, 0x00010420, 0x00000000, 0x08000020, + 0x00000020, 0x08000420, 0x08010420, 0x00010020, + 0x08010000, 0x00000400, 0x00000420, 0x08010400, + 0x08010400, 0x08000420, 0x00010020, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x08000400, + 0x08000000, 0x00010400, 0x08010420, 0x00000000, + 0x00010420, 0x08000000, 0x00000400, 0x00010020, + 0x08000420, 0x00000400, 0x00000000, 0x08010420, + 0x08010020, 0x08010400, 0x00000420, 0x00010000, + 0x00010400, 0x08010020, 0x08000400, 0x00000420, + 0x00000020, 0x00010420, 0x08010000, 0x08000020, + }, + { + 0x80000040, 0x00200040, 0x00000000, 0x80202000, + 0x00200040, 0x00002000, 0x80002040, 0x00200000, + 0x00002040, 0x80202040, 0x00202000, 0x80000000, + 0x80002000, 0x80000040, 0x80200000, 0x00202040, + 
0x00200000, 0x80002040, 0x80200040, 0x00000000, + 0x00002000, 0x00000040, 0x80202000, 0x80200040, + 0x80202040, 0x80200000, 0x80000000, 0x00002040, + 0x00000040, 0x00202000, 0x00202040, 0x80002000, + 0x00002040, 0x80000000, 0x80002000, 0x00202040, + 0x80202000, 0x00200040, 0x00000000, 0x80002000, + 0x80000000, 0x00002000, 0x80200040, 0x00200000, + 0x00200040, 0x80202040, 0x00202000, 0x00000040, + 0x80202040, 0x00202000, 0x00200000, 0x80002040, + 0x80000040, 0x80200000, 0x00202040, 0x00000000, + 0x00002000, 0x80000040, 0x80002040, 0x80202000, + 0x80200000, 0x00002040, 0x00000040, 0x80200040, + }, + { + 0x00004000, 0x00000200, 0x01000200, 0x01000004, + 0x01004204, 0x00004004, 0x00004200, 0x00000000, + 0x01000000, 0x01000204, 0x00000204, 0x01004000, + 0x00000004, 0x01004200, 0x01004000, 0x00000204, + 0x01000204, 0x00004000, 0x00004004, 0x01004204, + 0x00000000, 0x01000200, 0x01000004, 0x00004200, + 0x01004004, 0x00004204, 0x01004200, 0x00000004, + 0x00004204, 0x01004004, 0x00000200, 0x01000000, + 0x00004204, 0x01004000, 0x01004004, 0x00000204, + 0x00004000, 0x00000200, 0x01000000, 0x01004004, + 0x01000204, 0x00004204, 0x00004200, 0x00000000, + 0x00000200, 0x01000004, 0x00000004, 0x01000200, + 0x00000000, 0x01000204, 0x01000200, 0x00004200, + 0x00000204, 0x00004000, 0x01004204, 0x01000000, + 0x01004200, 0x00000004, 0x00004004, 0x01004204, + 0x01000004, 0x01004200, 0x01004000, 0x00004004, + }, + { + 0x20800080, 0x20820000, 0x00020080, 0x00000000, + 0x20020000, 0x00800080, 0x20800000, 0x20820080, + 0x00000080, 0x20000000, 0x00820000, 0x00020080, + 0x00820080, 0x20020080, 0x20000080, 0x20800000, + 0x00020000, 0x00820080, 0x00800080, 0x20020000, + 0x20820080, 0x20000080, 0x00000000, 0x00820000, + 0x20000000, 0x00800000, 0x20020080, 0x20800080, + 0x00800000, 0x00020000, 0x20820000, 0x00000080, + 0x00800000, 0x00020000, 0x20000080, 0x20820080, + 0x00020080, 0x20000000, 0x00000000, 0x00820000, + 0x20800080, 0x20020080, 0x20020000, 0x00800080, + 0x20820000, 0x00000080, 
0x00800080, 0x20020000, + 0x20820080, 0x00800000, 0x20800000, 0x20000080, + 0x00820000, 0x00020080, 0x20020080, 0x20800000, + 0x00000080, 0x20820000, 0x00820080, 0x00000000, + 0x20000000, 0x20800080, 0x00020000, 0x00820080, + } +}; + +__constant u32a c_skb[8][64] = +{ + { + 0x00000000, 0x00000010, 0x20000000, 0x20000010, + 0x00010000, 0x00010010, 0x20010000, 0x20010010, + 0x00000800, 0x00000810, 0x20000800, 0x20000810, + 0x00010800, 0x00010810, 0x20010800, 0x20010810, + 0x00000020, 0x00000030, 0x20000020, 0x20000030, + 0x00010020, 0x00010030, 0x20010020, 0x20010030, + 0x00000820, 0x00000830, 0x20000820, 0x20000830, + 0x00010820, 0x00010830, 0x20010820, 0x20010830, + 0x00080000, 0x00080010, 0x20080000, 0x20080010, + 0x00090000, 0x00090010, 0x20090000, 0x20090010, + 0x00080800, 0x00080810, 0x20080800, 0x20080810, + 0x00090800, 0x00090810, 0x20090800, 0x20090810, + 0x00080020, 0x00080030, 0x20080020, 0x20080030, + 0x00090020, 0x00090030, 0x20090020, 0x20090030, + 0x00080820, 0x00080830, 0x20080820, 0x20080830, + 0x00090820, 0x00090830, 0x20090820, 0x20090830, + }, + { + 0x00000000, 0x02000000, 0x00002000, 0x02002000, + 0x00200000, 0x02200000, 0x00202000, 0x02202000, + 0x00000004, 0x02000004, 0x00002004, 0x02002004, + 0x00200004, 0x02200004, 0x00202004, 0x02202004, + 0x00000400, 0x02000400, 0x00002400, 0x02002400, + 0x00200400, 0x02200400, 0x00202400, 0x02202400, + 0x00000404, 0x02000404, 0x00002404, 0x02002404, + 0x00200404, 0x02200404, 0x00202404, 0x02202404, + 0x10000000, 0x12000000, 0x10002000, 0x12002000, + 0x10200000, 0x12200000, 0x10202000, 0x12202000, + 0x10000004, 0x12000004, 0x10002004, 0x12002004, + 0x10200004, 0x12200004, 0x10202004, 0x12202004, + 0x10000400, 0x12000400, 0x10002400, 0x12002400, + 0x10200400, 0x12200400, 0x10202400, 0x12202400, + 0x10000404, 0x12000404, 0x10002404, 0x12002404, + 0x10200404, 0x12200404, 0x10202404, 0x12202404, + }, + { + 0x00000000, 0x00000001, 0x00040000, 0x00040001, + 0x01000000, 0x01000001, 0x01040000, 0x01040001, + 
0x00000002, 0x00000003, 0x00040002, 0x00040003, + 0x01000002, 0x01000003, 0x01040002, 0x01040003, + 0x00000200, 0x00000201, 0x00040200, 0x00040201, + 0x01000200, 0x01000201, 0x01040200, 0x01040201, + 0x00000202, 0x00000203, 0x00040202, 0x00040203, + 0x01000202, 0x01000203, 0x01040202, 0x01040203, + 0x08000000, 0x08000001, 0x08040000, 0x08040001, + 0x09000000, 0x09000001, 0x09040000, 0x09040001, + 0x08000002, 0x08000003, 0x08040002, 0x08040003, + 0x09000002, 0x09000003, 0x09040002, 0x09040003, + 0x08000200, 0x08000201, 0x08040200, 0x08040201, + 0x09000200, 0x09000201, 0x09040200, 0x09040201, + 0x08000202, 0x08000203, 0x08040202, 0x08040203, + 0x09000202, 0x09000203, 0x09040202, 0x09040203, + }, + { + 0x00000000, 0x00100000, 0x00000100, 0x00100100, + 0x00000008, 0x00100008, 0x00000108, 0x00100108, + 0x00001000, 0x00101000, 0x00001100, 0x00101100, + 0x00001008, 0x00101008, 0x00001108, 0x00101108, + 0x04000000, 0x04100000, 0x04000100, 0x04100100, + 0x04000008, 0x04100008, 0x04000108, 0x04100108, + 0x04001000, 0x04101000, 0x04001100, 0x04101100, + 0x04001008, 0x04101008, 0x04001108, 0x04101108, + 0x00020000, 0x00120000, 0x00020100, 0x00120100, + 0x00020008, 0x00120008, 0x00020108, 0x00120108, + 0x00021000, 0x00121000, 0x00021100, 0x00121100, + 0x00021008, 0x00121008, 0x00021108, 0x00121108, + 0x04020000, 0x04120000, 0x04020100, 0x04120100, + 0x04020008, 0x04120008, 0x04020108, 0x04120108, + 0x04021000, 0x04121000, 0x04021100, 0x04121100, + 0x04021008, 0x04121008, 0x04021108, 0x04121108, + }, + { + 0x00000000, 0x10000000, 0x00010000, 0x10010000, + 0x00000004, 0x10000004, 0x00010004, 0x10010004, + 0x20000000, 0x30000000, 0x20010000, 0x30010000, + 0x20000004, 0x30000004, 0x20010004, 0x30010004, + 0x00100000, 0x10100000, 0x00110000, 0x10110000, + 0x00100004, 0x10100004, 0x00110004, 0x10110004, + 0x20100000, 0x30100000, 0x20110000, 0x30110000, + 0x20100004, 0x30100004, 0x20110004, 0x30110004, + 0x00001000, 0x10001000, 0x00011000, 0x10011000, + 0x00001004, 0x10001004, 
0x00011004, 0x10011004, + 0x20001000, 0x30001000, 0x20011000, 0x30011000, + 0x20001004, 0x30001004, 0x20011004, 0x30011004, + 0x00101000, 0x10101000, 0x00111000, 0x10111000, + 0x00101004, 0x10101004, 0x00111004, 0x10111004, + 0x20101000, 0x30101000, 0x20111000, 0x30111000, + 0x20101004, 0x30101004, 0x20111004, 0x30111004, + }, + { + 0x00000000, 0x08000000, 0x00000008, 0x08000008, + 0x00000400, 0x08000400, 0x00000408, 0x08000408, + 0x00020000, 0x08020000, 0x00020008, 0x08020008, + 0x00020400, 0x08020400, 0x00020408, 0x08020408, + 0x00000001, 0x08000001, 0x00000009, 0x08000009, + 0x00000401, 0x08000401, 0x00000409, 0x08000409, + 0x00020001, 0x08020001, 0x00020009, 0x08020009, + 0x00020401, 0x08020401, 0x00020409, 0x08020409, + 0x02000000, 0x0A000000, 0x02000008, 0x0A000008, + 0x02000400, 0x0A000400, 0x02000408, 0x0A000408, + 0x02020000, 0x0A020000, 0x02020008, 0x0A020008, + 0x02020400, 0x0A020400, 0x02020408, 0x0A020408, + 0x02000001, 0x0A000001, 0x02000009, 0x0A000009, + 0x02000401, 0x0A000401, 0x02000409, 0x0A000409, + 0x02020001, 0x0A020001, 0x02020009, 0x0A020009, + 0x02020401, 0x0A020401, 0x02020409, 0x0A020409, + }, + { + 0x00000000, 0x00000100, 0x00080000, 0x00080100, + 0x01000000, 0x01000100, 0x01080000, 0x01080100, + 0x00000010, 0x00000110, 0x00080010, 0x00080110, + 0x01000010, 0x01000110, 0x01080010, 0x01080110, + 0x00200000, 0x00200100, 0x00280000, 0x00280100, + 0x01200000, 0x01200100, 0x01280000, 0x01280100, + 0x00200010, 0x00200110, 0x00280010, 0x00280110, + 0x01200010, 0x01200110, 0x01280010, 0x01280110, + 0x00000200, 0x00000300, 0x00080200, 0x00080300, + 0x01000200, 0x01000300, 0x01080200, 0x01080300, + 0x00000210, 0x00000310, 0x00080210, 0x00080310, + 0x01000210, 0x01000310, 0x01080210, 0x01080310, + 0x00200200, 0x00200300, 0x00280200, 0x00280300, + 0x01200200, 0x01200300, 0x01280200, 0x01280300, + 0x00200210, 0x00200310, 0x00280210, 0x00280310, + 0x01200210, 0x01200310, 0x01280210, 0x01280310, + }, + { + 0x00000000, 0x04000000, 0x00040000, 
0x04040000, + 0x00000002, 0x04000002, 0x00040002, 0x04040002, + 0x00002000, 0x04002000, 0x00042000, 0x04042000, + 0x00002002, 0x04002002, 0x00042002, 0x04042002, + 0x00000020, 0x04000020, 0x00040020, 0x04040020, + 0x00000022, 0x04000022, 0x00040022, 0x04040022, + 0x00002020, 0x04002020, 0x00042020, 0x04042020, + 0x00002022, 0x04002022, 0x00042022, 0x04042022, + 0x00000800, 0x04000800, 0x00040800, 0x04040800, + 0x00000802, 0x04000802, 0x00040802, 0x04040802, + 0x00002800, 0x04002800, 0x00042800, 0x04042800, + 0x00002802, 0x04002802, 0x00042802, 0x04042802, + 0x00000820, 0x04000820, 0x00040820, 0x04040820, + 0x00000822, 0x04000822, 0x00040822, 0x04040822, + 0x00002820, 0x04002820, 0x00042820, 0x04042820, + 0x00002822, 0x04002822, 0x00042822, 0x04042822 + } +}; + +#if VECT_SIZE == 1 +#define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) +#endif + +void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 (*s_SPtrans)[64]) +{ + u32 r = data[0]; + u32 l = data[1]; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i += 2) + { + u32 u; + u32 t; + + u = Kc[i + 0] ^ rotl32 (r, 30u); + t = Kd[i + 0] ^ rotl32 (r, 26u); + + l ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 
16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + + u = Kc[i + 1] ^ rotl32 (l, 30u); + t = Kd[i + 1] ^ rotl32 (l, 26u); + + r ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + } + + iv[0] = l; + iv[1] = r; +} + +void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 (*s_skb)[64]) +{ + u32 tt; + + PERM_OP (d, c, tt, 4, 0x0f0f0f0f); + HPERM_OP (c, tt, 2, 0xcccc0000); + HPERM_OP (d, tt, 2, 0xcccc0000); + PERM_OP (d, c, tt, 1, 0x55555555); + PERM_OP (c, d, tt, 8, 0x00ff00ff); + PERM_OP (d, c, tt, 1, 0x55555555); + + d = ((d & 0x000000ff) << 16) + | ((d & 0x0000ff00) << 0) + | ((d & 0x00ff0000) >> 16) + | ((c & 0xf0000000) >> 4); + + c = c & 0x0fffffff; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i++) + { + if ((i < 2) || (i == 8) || (i == 15)) + { + c = ((c >> 1) | (c << 27)); + d = ((d >> 1) | (d << 27)); + } + else + { + c = ((c >> 2) | (c << 26)); + d = ((d >> 2) | (d << 26)); + } + + c = c & 0x0fffffff; + d = d & 0x0fffffff; + + const u32 c00 = (c >> 0) & 0x0000003f; + const u32 c06 = (c >> 6) & 0x00383003; + const u32 c07 = (c >> 7) & 0x0000003c; + const u32 c13 = (c >> 13) & 0x0000060f; + const u32 c20 = (c >> 20) & 0x00000001; + + u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); + + const u32 d00 = (d >> 0) & 0x00003c3f; + const u32 
d07 = (d >> 7) & 0x00003f03; + const u32 d21 = (d >> 21) & 0x0000000f; + const u32 d22 = (d >> 22) & 0x00000030; + + u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); + + Kc[i] = ((t << 16) | (s & 0x0000ffff)); + Kd[i] = ((s >> 16) | (t & 0xffff0000)); + } +} + +void transform_netntlmv1_key (const u32 w0, const u32 w1, u32 out[2]) +{ + u32 t[8]; + + t[0] = (w0 >> 0) & 0xff; + t[1] = (w0 >> 8) & 0xff; + t[2] = (w0 >> 16) & 0xff; + t[3] = (w0 >> 24) & 0xff; + t[4] = (w1 >> 0) & 0xff; + t[5] = (w1 >> 8) & 0xff; + t[6] = (w1 >> 16) & 0xff; + t[7] = (w1 >> 24) & 0xff; + + u32 k[8]; + + k[0] = (t[0] >> 0); + k[1] = (t[0] << 7) | (t[1] >> 1); + k[2] = (t[1] << 6) | (t[2] >> 2); + k[3] = (t[2] << 5) | (t[3] >> 3); + k[4] = (t[3] << 4) | (t[4] >> 4); + k[5] = (t[4] << 3) | (t[5] >> 5); + k[6] = (t[5] << 2) | (t[6] >> 6); + k[7] = (t[6] << 1); + + out[0] = ((k[0] & 0xff) << 0) + | ((k[1] & 0xff) << 8) + | ((k[2] & 0xff) << 16) + | ((k[3] & 0xff) << 24); + + out[1] = ((k[4] & 0xff) << 0) + | ((k[5] & 0xff) << 8) + | ((k[6] & 0xff) << 16) + | ((k[7] & 0xff) << 24); +} + +__kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 
*d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + const u32 a = ctx.h[0]; + const u32 b = ctx.h[1]; + const u32 c = ctx.h[2]; + const u32 d = ctx.h[3]; + + if ((d >> 16) != s2) continue; + + /** + * DES1 + */ + + u32 key[2]; + + transform_netntlmv1_key (a, b, key); + + u32 Kc[16]; 
+ u32 Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 data[2]; + + data[0] = s0; + data[1] = s1; + + u32 out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + + const u32 r0 = out1[0]; + const u32 r1 = out1[1]; + const u32 r2 = out2[0]; + const u32 r3 = out2[1]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] 
= c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + const u32 a = ctx.h[0]; + const u32 b = ctx.h[1]; + const u32 c = ctx.h[2]; + const u32 d = ctx.h[3]; + + if ((d >> 16) != s2) continue; + + /** + * DES1 + */ + + u32 key[2]; + + transform_netntlmv1_key (a, b, key); + + u32 Kc[16]; + u32 Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 data[2]; + + data[0] = s0; + data[1] = s1; + + u32 out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + /* + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 out2[2]; + + 
_des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + */ + + const u32 r0 = out1[0]; + const u32 r1 = out1[1]; + const u32 r2 = search[2]; + const u32 r3 = search[3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05500_a1.cl b/OpenCL/m05500_a1.cl new file mode 100644 index 000000000..be64670a1 --- /dev/null +++ b/OpenCL/m05500_a1.cl @@ -0,0 +1,746 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" + +#define PERM_OP(a,b,tt,n,m) \ +{ \ + tt = a >> n; \ + tt = tt ^ b; \ + tt = tt & m; \ + b = b ^ tt; \ + tt = tt << n; \ + a = a ^ tt; \ +} + +#define HPERM_OP(a,tt,n,m) \ +{ \ + tt = a << (16 + n); \ + tt = tt ^ a; \ + tt = tt & m; \ + a = a ^ tt; \ + tt = tt >> (16 + n); \ + a = a ^ tt; \ +} + +__constant u32a c_SPtrans[8][64] = +{ + { + 0x02080800, 0x00080000, 0x02000002, 0x02080802, + 0x02000000, 0x00080802, 0x00080002, 0x02000002, + 0x00080802, 0x02080800, 0x02080000, 0x00000802, + 0x02000802, 0x02000000, 0x00000000, 0x00080002, + 0x00080000, 0x00000002, 0x02000800, 0x00080800, + 0x02080802, 0x02080000, 0x00000802, 0x02000800, + 0x00000002, 0x00000800, 0x00080800, 0x02080002, + 0x00000800, 0x02000802, 0x02080002, 0x00000000, + 0x00000000, 0x02080802, 0x02000800, 0x00080002, + 0x02080800, 0x00080000, 0x00000802, 0x02000800, + 0x02080002, 0x00000800, 0x00080800, 0x02000002, + 0x00080802, 0x00000002, 0x02000002, 0x02080000, + 0x02080802, 0x00080800, 0x02080000, 0x02000802, + 0x02000000, 0x00000802, 0x00080002, 0x00000000, + 0x00080000, 0x02000000, 0x02000802, 0x02080800, + 0x00000002, 0x02080002, 0x00000800, 0x00080802, + }, + { + 0x40108010, 0x00000000, 0x00108000, 0x40100000, + 0x40000010, 0x00008010, 0x40008000, 0x00108000, + 0x00008000, 0x40100010, 0x00000010, 0x40008000, + 0x00100010, 0x40108000, 
0x40100000, 0x00000010, + 0x00100000, 0x40008010, 0x40100010, 0x00008000, + 0x00108010, 0x40000000, 0x00000000, 0x00100010, + 0x40008010, 0x00108010, 0x40108000, 0x40000010, + 0x40000000, 0x00100000, 0x00008010, 0x40108010, + 0x00100010, 0x40108000, 0x40008000, 0x00108010, + 0x40108010, 0x00100010, 0x40000010, 0x00000000, + 0x40000000, 0x00008010, 0x00100000, 0x40100010, + 0x00008000, 0x40000000, 0x00108010, 0x40008010, + 0x40108000, 0x00008000, 0x00000000, 0x40000010, + 0x00000010, 0x40108010, 0x00108000, 0x40100000, + 0x40100010, 0x00100000, 0x00008010, 0x40008000, + 0x40008010, 0x00000010, 0x40100000, 0x00108000, + }, + { + 0x04000001, 0x04040100, 0x00000100, 0x04000101, + 0x00040001, 0x04000000, 0x04000101, 0x00040100, + 0x04000100, 0x00040000, 0x04040000, 0x00000001, + 0x04040101, 0x00000101, 0x00000001, 0x04040001, + 0x00000000, 0x00040001, 0x04040100, 0x00000100, + 0x00000101, 0x04040101, 0x00040000, 0x04000001, + 0x04040001, 0x04000100, 0x00040101, 0x04040000, + 0x00040100, 0x00000000, 0x04000000, 0x00040101, + 0x04040100, 0x00000100, 0x00000001, 0x00040000, + 0x00000101, 0x00040001, 0x04040000, 0x04000101, + 0x00000000, 0x04040100, 0x00040100, 0x04040001, + 0x00040001, 0x04000000, 0x04040101, 0x00000001, + 0x00040101, 0x04000001, 0x04000000, 0x04040101, + 0x00040000, 0x04000100, 0x04000101, 0x00040100, + 0x04000100, 0x00000000, 0x04040001, 0x00000101, + 0x04000001, 0x00040101, 0x00000100, 0x04040000, + }, + { + 0x00401008, 0x10001000, 0x00000008, 0x10401008, + 0x00000000, 0x10400000, 0x10001008, 0x00400008, + 0x10401000, 0x10000008, 0x10000000, 0x00001008, + 0x10000008, 0x00401008, 0x00400000, 0x10000000, + 0x10400008, 0x00401000, 0x00001000, 0x00000008, + 0x00401000, 0x10001008, 0x10400000, 0x00001000, + 0x00001008, 0x00000000, 0x00400008, 0x10401000, + 0x10001000, 0x10400008, 0x10401008, 0x00400000, + 0x10400008, 0x00001008, 0x00400000, 0x10000008, + 0x00401000, 0x10001000, 0x00000008, 0x10400000, + 0x10001008, 0x00000000, 0x00001000, 0x00400008, + 
0x00000000, 0x10400008, 0x10401000, 0x00001000, + 0x10000000, 0x10401008, 0x00401008, 0x00400000, + 0x10401008, 0x00000008, 0x10001000, 0x00401008, + 0x00400008, 0x00401000, 0x10400000, 0x10001008, + 0x00001008, 0x10000000, 0x10000008, 0x10401000, + }, + { + 0x08000000, 0x00010000, 0x00000400, 0x08010420, + 0x08010020, 0x08000400, 0x00010420, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x00010400, + 0x08000420, 0x08010020, 0x08010400, 0x00000000, + 0x00010400, 0x08000000, 0x00010020, 0x00000420, + 0x08000400, 0x00010420, 0x00000000, 0x08000020, + 0x00000020, 0x08000420, 0x08010420, 0x00010020, + 0x08010000, 0x00000400, 0x00000420, 0x08010400, + 0x08010400, 0x08000420, 0x00010020, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x08000400, + 0x08000000, 0x00010400, 0x08010420, 0x00000000, + 0x00010420, 0x08000000, 0x00000400, 0x00010020, + 0x08000420, 0x00000400, 0x00000000, 0x08010420, + 0x08010020, 0x08010400, 0x00000420, 0x00010000, + 0x00010400, 0x08010020, 0x08000400, 0x00000420, + 0x00000020, 0x00010420, 0x08010000, 0x08000020, + }, + { + 0x80000040, 0x00200040, 0x00000000, 0x80202000, + 0x00200040, 0x00002000, 0x80002040, 0x00200000, + 0x00002040, 0x80202040, 0x00202000, 0x80000000, + 0x80002000, 0x80000040, 0x80200000, 0x00202040, + 0x00200000, 0x80002040, 0x80200040, 0x00000000, + 0x00002000, 0x00000040, 0x80202000, 0x80200040, + 0x80202040, 0x80200000, 0x80000000, 0x00002040, + 0x00000040, 0x00202000, 0x00202040, 0x80002000, + 0x00002040, 0x80000000, 0x80002000, 0x00202040, + 0x80202000, 0x00200040, 0x00000000, 0x80002000, + 0x80000000, 0x00002000, 0x80200040, 0x00200000, + 0x00200040, 0x80202040, 0x00202000, 0x00000040, + 0x80202040, 0x00202000, 0x00200000, 0x80002040, + 0x80000040, 0x80200000, 0x00202040, 0x00000000, + 0x00002000, 0x80000040, 0x80002040, 0x80202000, + 0x80200000, 0x00002040, 0x00000040, 0x80200040, + }, + { + 0x00004000, 0x00000200, 0x01000200, 0x01000004, + 0x01004204, 0x00004004, 0x00004200, 0x00000000, + 0x01000000, 
0x01000204, 0x00000204, 0x01004000, + 0x00000004, 0x01004200, 0x01004000, 0x00000204, + 0x01000204, 0x00004000, 0x00004004, 0x01004204, + 0x00000000, 0x01000200, 0x01000004, 0x00004200, + 0x01004004, 0x00004204, 0x01004200, 0x00000004, + 0x00004204, 0x01004004, 0x00000200, 0x01000000, + 0x00004204, 0x01004000, 0x01004004, 0x00000204, + 0x00004000, 0x00000200, 0x01000000, 0x01004004, + 0x01000204, 0x00004204, 0x00004200, 0x00000000, + 0x00000200, 0x01000004, 0x00000004, 0x01000200, + 0x00000000, 0x01000204, 0x01000200, 0x00004200, + 0x00000204, 0x00004000, 0x01004204, 0x01000000, + 0x01004200, 0x00000004, 0x00004004, 0x01004204, + 0x01000004, 0x01004200, 0x01004000, 0x00004004, + }, + { + 0x20800080, 0x20820000, 0x00020080, 0x00000000, + 0x20020000, 0x00800080, 0x20800000, 0x20820080, + 0x00000080, 0x20000000, 0x00820000, 0x00020080, + 0x00820080, 0x20020080, 0x20000080, 0x20800000, + 0x00020000, 0x00820080, 0x00800080, 0x20020000, + 0x20820080, 0x20000080, 0x00000000, 0x00820000, + 0x20000000, 0x00800000, 0x20020080, 0x20800080, + 0x00800000, 0x00020000, 0x20820000, 0x00000080, + 0x00800000, 0x00020000, 0x20000080, 0x20820080, + 0x00020080, 0x20000000, 0x00000000, 0x00820000, + 0x20800080, 0x20020080, 0x20020000, 0x00800080, + 0x20820000, 0x00000080, 0x00800080, 0x20020000, + 0x20820080, 0x00800000, 0x20800000, 0x20000080, + 0x00820000, 0x00020080, 0x20020080, 0x20800000, + 0x00000080, 0x20820000, 0x00820080, 0x00000000, + 0x20000000, 0x20800080, 0x00020000, 0x00820080, + } +}; + +__constant u32a c_skb[8][64] = +{ + { + 0x00000000, 0x00000010, 0x20000000, 0x20000010, + 0x00010000, 0x00010010, 0x20010000, 0x20010010, + 0x00000800, 0x00000810, 0x20000800, 0x20000810, + 0x00010800, 0x00010810, 0x20010800, 0x20010810, + 0x00000020, 0x00000030, 0x20000020, 0x20000030, + 0x00010020, 0x00010030, 0x20010020, 0x20010030, + 0x00000820, 0x00000830, 0x20000820, 0x20000830, + 0x00010820, 0x00010830, 0x20010820, 0x20010830, + 0x00080000, 0x00080010, 0x20080000, 0x20080010, + 
0x00090000, 0x00090010, 0x20090000, 0x20090010, + 0x00080800, 0x00080810, 0x20080800, 0x20080810, + 0x00090800, 0x00090810, 0x20090800, 0x20090810, + 0x00080020, 0x00080030, 0x20080020, 0x20080030, + 0x00090020, 0x00090030, 0x20090020, 0x20090030, + 0x00080820, 0x00080830, 0x20080820, 0x20080830, + 0x00090820, 0x00090830, 0x20090820, 0x20090830, + }, + { + 0x00000000, 0x02000000, 0x00002000, 0x02002000, + 0x00200000, 0x02200000, 0x00202000, 0x02202000, + 0x00000004, 0x02000004, 0x00002004, 0x02002004, + 0x00200004, 0x02200004, 0x00202004, 0x02202004, + 0x00000400, 0x02000400, 0x00002400, 0x02002400, + 0x00200400, 0x02200400, 0x00202400, 0x02202400, + 0x00000404, 0x02000404, 0x00002404, 0x02002404, + 0x00200404, 0x02200404, 0x00202404, 0x02202404, + 0x10000000, 0x12000000, 0x10002000, 0x12002000, + 0x10200000, 0x12200000, 0x10202000, 0x12202000, + 0x10000004, 0x12000004, 0x10002004, 0x12002004, + 0x10200004, 0x12200004, 0x10202004, 0x12202004, + 0x10000400, 0x12000400, 0x10002400, 0x12002400, + 0x10200400, 0x12200400, 0x10202400, 0x12202400, + 0x10000404, 0x12000404, 0x10002404, 0x12002404, + 0x10200404, 0x12200404, 0x10202404, 0x12202404, + }, + { + 0x00000000, 0x00000001, 0x00040000, 0x00040001, + 0x01000000, 0x01000001, 0x01040000, 0x01040001, + 0x00000002, 0x00000003, 0x00040002, 0x00040003, + 0x01000002, 0x01000003, 0x01040002, 0x01040003, + 0x00000200, 0x00000201, 0x00040200, 0x00040201, + 0x01000200, 0x01000201, 0x01040200, 0x01040201, + 0x00000202, 0x00000203, 0x00040202, 0x00040203, + 0x01000202, 0x01000203, 0x01040202, 0x01040203, + 0x08000000, 0x08000001, 0x08040000, 0x08040001, + 0x09000000, 0x09000001, 0x09040000, 0x09040001, + 0x08000002, 0x08000003, 0x08040002, 0x08040003, + 0x09000002, 0x09000003, 0x09040002, 0x09040003, + 0x08000200, 0x08000201, 0x08040200, 0x08040201, + 0x09000200, 0x09000201, 0x09040200, 0x09040201, + 0x08000202, 0x08000203, 0x08040202, 0x08040203, + 0x09000202, 0x09000203, 0x09040202, 0x09040203, + }, + { + 0x00000000, 
0x00100000, 0x00000100, 0x00100100, + 0x00000008, 0x00100008, 0x00000108, 0x00100108, + 0x00001000, 0x00101000, 0x00001100, 0x00101100, + 0x00001008, 0x00101008, 0x00001108, 0x00101108, + 0x04000000, 0x04100000, 0x04000100, 0x04100100, + 0x04000008, 0x04100008, 0x04000108, 0x04100108, + 0x04001000, 0x04101000, 0x04001100, 0x04101100, + 0x04001008, 0x04101008, 0x04001108, 0x04101108, + 0x00020000, 0x00120000, 0x00020100, 0x00120100, + 0x00020008, 0x00120008, 0x00020108, 0x00120108, + 0x00021000, 0x00121000, 0x00021100, 0x00121100, + 0x00021008, 0x00121008, 0x00021108, 0x00121108, + 0x04020000, 0x04120000, 0x04020100, 0x04120100, + 0x04020008, 0x04120008, 0x04020108, 0x04120108, + 0x04021000, 0x04121000, 0x04021100, 0x04121100, + 0x04021008, 0x04121008, 0x04021108, 0x04121108, + }, + { + 0x00000000, 0x10000000, 0x00010000, 0x10010000, + 0x00000004, 0x10000004, 0x00010004, 0x10010004, + 0x20000000, 0x30000000, 0x20010000, 0x30010000, + 0x20000004, 0x30000004, 0x20010004, 0x30010004, + 0x00100000, 0x10100000, 0x00110000, 0x10110000, + 0x00100004, 0x10100004, 0x00110004, 0x10110004, + 0x20100000, 0x30100000, 0x20110000, 0x30110000, + 0x20100004, 0x30100004, 0x20110004, 0x30110004, + 0x00001000, 0x10001000, 0x00011000, 0x10011000, + 0x00001004, 0x10001004, 0x00011004, 0x10011004, + 0x20001000, 0x30001000, 0x20011000, 0x30011000, + 0x20001004, 0x30001004, 0x20011004, 0x30011004, + 0x00101000, 0x10101000, 0x00111000, 0x10111000, + 0x00101004, 0x10101004, 0x00111004, 0x10111004, + 0x20101000, 0x30101000, 0x20111000, 0x30111000, + 0x20101004, 0x30101004, 0x20111004, 0x30111004, + }, + { + 0x00000000, 0x08000000, 0x00000008, 0x08000008, + 0x00000400, 0x08000400, 0x00000408, 0x08000408, + 0x00020000, 0x08020000, 0x00020008, 0x08020008, + 0x00020400, 0x08020400, 0x00020408, 0x08020408, + 0x00000001, 0x08000001, 0x00000009, 0x08000009, + 0x00000401, 0x08000401, 0x00000409, 0x08000409, + 0x00020001, 0x08020001, 0x00020009, 0x08020009, + 0x00020401, 0x08020401, 0x00020409, 
0x08020409, + 0x02000000, 0x0A000000, 0x02000008, 0x0A000008, + 0x02000400, 0x0A000400, 0x02000408, 0x0A000408, + 0x02020000, 0x0A020000, 0x02020008, 0x0A020008, + 0x02020400, 0x0A020400, 0x02020408, 0x0A020408, + 0x02000001, 0x0A000001, 0x02000009, 0x0A000009, + 0x02000401, 0x0A000401, 0x02000409, 0x0A000409, + 0x02020001, 0x0A020001, 0x02020009, 0x0A020009, + 0x02020401, 0x0A020401, 0x02020409, 0x0A020409, + }, + { + 0x00000000, 0x00000100, 0x00080000, 0x00080100, + 0x01000000, 0x01000100, 0x01080000, 0x01080100, + 0x00000010, 0x00000110, 0x00080010, 0x00080110, + 0x01000010, 0x01000110, 0x01080010, 0x01080110, + 0x00200000, 0x00200100, 0x00280000, 0x00280100, + 0x01200000, 0x01200100, 0x01280000, 0x01280100, + 0x00200010, 0x00200110, 0x00280010, 0x00280110, + 0x01200010, 0x01200110, 0x01280010, 0x01280110, + 0x00000200, 0x00000300, 0x00080200, 0x00080300, + 0x01000200, 0x01000300, 0x01080200, 0x01080300, + 0x00000210, 0x00000310, 0x00080210, 0x00080310, + 0x01000210, 0x01000310, 0x01080210, 0x01080310, + 0x00200200, 0x00200300, 0x00280200, 0x00280300, + 0x01200200, 0x01200300, 0x01280200, 0x01280300, + 0x00200210, 0x00200310, 0x00280210, 0x00280310, + 0x01200210, 0x01200310, 0x01280210, 0x01280310, + }, + { + 0x00000000, 0x04000000, 0x00040000, 0x04040000, + 0x00000002, 0x04000002, 0x00040002, 0x04040002, + 0x00002000, 0x04002000, 0x00042000, 0x04042000, + 0x00002002, 0x04002002, 0x00042002, 0x04042002, + 0x00000020, 0x04000020, 0x00040020, 0x04040020, + 0x00000022, 0x04000022, 0x00040022, 0x04040022, + 0x00002020, 0x04002020, 0x00042020, 0x04042020, + 0x00002022, 0x04002022, 0x00042022, 0x04042022, + 0x00000800, 0x04000800, 0x00040800, 0x04040800, + 0x00000802, 0x04000802, 0x00040802, 0x04040802, + 0x00002800, 0x04002800, 0x00042800, 0x04042800, + 0x00002802, 0x04002802, 0x00042802, 0x04042802, + 0x00000820, 0x04000820, 0x00040820, 0x04040820, + 0x00000822, 0x04000822, 0x00040822, 0x04040822, + 0x00002820, 0x04002820, 0x00042820, 0x04042820, + 0x00002822, 
0x04002822, 0x00042822, 0x04042822 + } +}; + +#if VECT_SIZE == 1 +#define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) +#endif + +void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 (*s_SPtrans)[64]) +{ + u32 r = data[0]; + u32 l = data[1]; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i += 2) + { + u32 u; + u32 t; + + u = Kc[i + 0] ^ rotl32 (r, 30u); + t = Kd[i + 0] ^ rotl32 (r, 26u); + + l ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + + u = Kc[i + 1] ^ rotl32 (l, 30u); + t = Kd[i + 1] ^ rotl32 (l, 26u); + + r ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + } + + iv[0] = l; + iv[1] = r; +} + +void _des_crypt_keysetup (u32 
c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 (*s_skb)[64]) +{ + u32 tt; + + PERM_OP (d, c, tt, 4, 0x0f0f0f0f); + HPERM_OP (c, tt, 2, 0xcccc0000); + HPERM_OP (d, tt, 2, 0xcccc0000); + PERM_OP (d, c, tt, 1, 0x55555555); + PERM_OP (c, d, tt, 8, 0x00ff00ff); + PERM_OP (d, c, tt, 1, 0x55555555); + + d = ((d & 0x000000ff) << 16) + | ((d & 0x0000ff00) << 0) + | ((d & 0x00ff0000) >> 16) + | ((c & 0xf0000000) >> 4); + + c = c & 0x0fffffff; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i++) + { + if ((i < 2) || (i == 8) || (i == 15)) + { + c = ((c >> 1) | (c << 27)); + d = ((d >> 1) | (d << 27)); + } + else + { + c = ((c >> 2) | (c << 26)); + d = ((d >> 2) | (d << 26)); + } + + c = c & 0x0fffffff; + d = d & 0x0fffffff; + + const u32 c00 = (c >> 0) & 0x0000003f; + const u32 c06 = (c >> 6) & 0x00383003; + const u32 c07 = (c >> 7) & 0x0000003c; + const u32 c13 = (c >> 13) & 0x0000060f; + const u32 c20 = (c >> 20) & 0x00000001; + + u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); + + const u32 d00 = (d >> 0) & 0x00003c3f; + const u32 d07 = (d >> 7) & 0x00003f03; + const u32 d21 = (d >> 21) & 0x0000000f; + const u32 d22 = (d >> 22) & 0x00000030; + + u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); + + Kc[i] = ((t << 16) | (s & 0x0000ffff)); + Kd[i] = ((s >> 16) | (t & 0xffff0000)); + } +} + +void transform_netntlmv1_key (const u32 w0, const u32 w1, u32 out[2]) +{ + u32 t[8]; + + t[0] = (w0 >> 0) & 0xff; + t[1] = (w0 >> 8) & 0xff; + t[2] = (w0 >> 16) & 0xff; + t[3] = (w0 >> 24) & 0xff; + t[4] = (w1 >> 0) & 0xff; + t[5] = (w1 >> 8) & 0xff; + t[6] = (w1 >> 16) & 0xff; + t[7] = (w1 >> 24) & 
0xff;

  u32 k[8];

  // Spread the 7 input bytes across 8 DES key bytes: DES uses only 7
  // significant bits per key byte (the low bit is a parity bit the key
  // schedule ignores), so each byte is assembled from two neighbours.
  k[0] = (t[0] >> 0);
  k[1] = (t[0] << 7) | (t[1] >> 1);
  k[2] = (t[1] << 6) | (t[2] >> 2);
  k[3] = (t[2] << 5) | (t[3] >> 3);
  k[4] = (t[3] << 4) | (t[4] >> 4);
  k[5] = (t[4] << 3) | (t[5] >> 5);
  k[6] = (t[5] << 2) | (t[6] >> 6);
  k[7] = (t[6] << 1);

  // Repack the 8 key bytes into two little-endian 32-bit words.
  out[0] = ((k[0] & 0xff) << 0)
         | ((k[1] & 0xff) << 8)
         | ((k[2] & 0xff) << 16)
         | ((k[3] & 0xff) << 24);

  out[1] = ((k[4] & 0xff) << 0)
         | ((k[5] & 0xff) << 8)
         | ((k[6] & 0xff) << 16)
         | ((k[7] & 0xff) << 24);
}

/**
 * m05500_mxx - NetNTLMv1 combinator attack, multi-hash ("m") kernel.
 *
 * Candidate password = left part (pws[gid]) + right part (combs_buf[il_pos]),
 * both appended as UTF-16LE.  The NTLM hash (MD4 of the UTF-16LE password)
 * provides the DES key material; two DES encryptions of the 8-byte server
 * challenge (salt_buf[0..1]) produce the 16 response bytes compared here.
 */
__kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * sbox, kbox: stage the DES lookup tables in local memory,
   * filled cooperatively by the whole work-group.
   */

  __local u32 s_SPtrans[8][64];
  __local u32 s_skb[8][64];

  for (u32 i = lid; i < 64; i += lsz)
  {
    s_SPtrans[0][i] = c_SPtrans[0][i];
    s_SPtrans[1][i] = c_SPtrans[1][i];
    s_SPtrans[2][i] = c_SPtrans[2][i];
    s_SPtrans[3][i] = c_SPtrans[3][i];
    s_SPtrans[4][i] = c_SPtrans[4][i];
    s_SPtrans[5][i] = c_SPtrans[5][i];
    s_SPtrans[6][i] = c_SPtrans[6][i];
    s_SPtrans[7][i] = c_SPtrans[7][i];

    s_skb[0][i] = c_skb[0][i];
    s_skb[1][i] = c_skb[1][i];
    s_skb[2][i] = c_skb[2][i];
    s_skb[3][i] = c_skb[3][i];
    s_skb[4][i] = c_skb[4][i];
    s_skb[5][i] = c_skb[5][i];
    s_skb[6][i] = c_skb[6][i];
    s_skb[7][i] = c_skb[7][i];
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  // Bounds check only AFTER the barrier: every work-item must reach it.
  if (gid >= gid_max) return;

  /**
   * salt: salt_buf[0..1] hold the 8-byte server challenge;
   * salt_buf[2] is compared against the top 16 bits of MD4 word d below
   * (presumably the 2 hash bytes recovered from the third DES block of
   * the NetNTLMv1 response -- TODO confirm against the host-side module).
   */

  const u32 s0 = salt_bufs[salt_pos].salt_buf[0];
  const u32 s1 = salt_bufs[salt_pos].salt_buf[1];
  const u32 s2 = salt_bufs[salt_pos].salt_buf[2];

  /**
   * base: hash the fixed left-hand password part once per work-item.
   */

  md4_ctx_t ctx0;

  md4_init (&ctx0);

  md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop: append each right-hand candidate and finish the MD4.
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    md4_ctx_t ctx = ctx0;

    md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md4_final (&ctx);

    const u32 a = ctx.h[0];
    const u32 b = ctx.h[1];
    const u32 c = ctx.h[2];
    const u32 d = ctx.h[3];

    // Cheap early reject before any DES work: the candidate's MD4 tail
    // must match the stored 16-bit value.
    if ((d >> 16) != s2) continue;

    /**
     * DES1: key = NT hash bytes 0..6
     */

    u32 key[2];

    transform_netntlmv1_key (a, b, key);

    u32 Kc[16];
    u32 Kd[16];

    _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);

    u32 data[2];

    data[0] = s0;
    data[1] = s1;

    u32 out1[2];

    _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans);

    /**
     * DES2: key = NT hash bytes 7..13 (re-aligned from words b, c, d)
     */

    transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key);

    _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);

    u32 out2[2];

    _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans);

    const u32 r0 = out1[0];
    const u32 r1 = out1[1];
    const u32 r2 = out2[0];
    const u32 r3 = out2[1];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}

__kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32
*bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * m05500_sxx - NetNTLMv1 combinator attack, single-hash ("s") kernel.
   * Same candidate construction as m05500_mxx, but compares against one
   * known digest, which permits skipping the second DES entirely (below).
   */

  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * sbox, kbox: DES lookup tables staged in local memory.
   */

  __local u32 s_SPtrans[8][64];
  __local u32 s_skb[8][64];

  for (u32 i = lid; i < 64; i += lsz)
  {
    s_SPtrans[0][i] = c_SPtrans[0][i];
    s_SPtrans[1][i] = c_SPtrans[1][i];
    s_SPtrans[2][i] = c_SPtrans[2][i];
    s_SPtrans[3][i] = c_SPtrans[3][i];
    s_SPtrans[4][i] = c_SPtrans[4][i];
    s_SPtrans[5][i] = c_SPtrans[5][i];
    s_SPtrans[6][i] = c_SPtrans[6][i];
    s_SPtrans[7][i] = c_SPtrans[7][i];

    s_skb[0][i] = c_skb[0][i];
    s_skb[1][i] = c_skb[1][i];
    s_skb[2][i] = c_skb[2][i];
    s_skb[3][i] = c_skb[3][i];
    s_skb[4][i] = c_skb[4][i];
    s_skb[5][i] = c_skb[5][i];
    s_skb[6][i] = c_skb[6][i];
    s_skb[7][i] = c_skb[7][i];
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  // Guard after the barrier so all work-items participate in table setup.
  if (gid >= gid_max) return;

  /**
   * digest: the single searched digest, in result-register order.
   */

  const u32 search[4] =
  {
    digests_buf[digests_offset].digest_buf[DGST_R0],
    digests_buf[digests_offset].digest_buf[DGST_R1],
    digests_buf[digests_offset].digest_buf[DGST_R2],
    digests_buf[digests_offset].digest_buf[DGST_R3]
  };

  /**
   * salt: server challenge (s0, s1) and the 16-bit early-reject value (s2).
   */

  const u32 s0 = salt_bufs[salt_pos].salt_buf[0];
  const u32 s1 = salt_bufs[salt_pos].salt_buf[1];
  const u32 s2 = salt_bufs[salt_pos].salt_buf[2];

  /**
   * base: hash the fixed left-hand password part once.
   */

  md4_ctx_t ctx0;

  md4_init (&ctx0);

  md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    md4_ctx_t ctx = ctx0;

    md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md4_final (&ctx);

    const u32 a = ctx.h[0];
    const u32 b = ctx.h[1];
    const u32 c = ctx.h[2];
    const u32 d = ctx.h[3];

    // Early reject on the MD4 tail before doing any DES work.
    if ((d >> 16) != s2) continue;

    /**
     * DES1
     */

    u32 key[2];

    transform_netntlmv1_key (a, b, key);

    u32 Kc[16];
    u32 Kd[16];

    _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);

    u32 data[2];

    data[0] = s0;
    data[1] = s1;

    u32 out1[2];

    _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans);

    /**
     * DES2: intentionally disabled for single-hash search -- r2/r3 are
     * pinned to the searched digest words below, so only the DES1 output
     * needs computing; a DES1 match implies the rest is checked on host.
     */

    /*
    transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key);

    _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);

    u32 out2[2];

    _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans);
    */

    const u32 r0 = out1[0];
    const u32 r1 = out1[1];
    const u32 r2 = search[2];
    const u32 r3 = search[3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}
diff --git a/OpenCL/m05500_a3.cl b/OpenCL/m05500_a3.cl
new file mode 100644
index 000000000..192e3c2a7
--- /dev/null
+++ b/OpenCL/m05500_a3.cl
@@ -0,0 +1,776 @@
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

#define NEW_SIMD_CODE

#include "inc_vendor.cl"
#include "inc_hash_constants.h"
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_simd.cl"
#include "inc_hash_md4.cl"

/* PC-style bit permutation step (classic OpenSSL DES PERM_OP): swaps the
   bits of a and b selected by mask m at distance n. */
#define PERM_OP(a,b,tt,n,m) \
{ \
  tt = a >> n; \
  tt = tt ^ b; \
  tt = tt & m; \
  b = b ^ tt; \
  tt = tt << n; \
  a = a ^ tt; \
}

/* Same idea within a single word, across its halves. */
#define HPERM_OP(a,tt,n,m) \
{ \
  tt = a << (16 + n); \
  tt = tt ^ a; \
tt = tt & m; \
  a = a ^ tt; \
  tt = tt >> (16 + n); \
  a = a ^ tt; \
}

/* Combined DES S-box + P permutation tables (SPtrans layout as used by
   OpenSSL's DES implementation): 8 boxes of 64 entries, indexed by the
   6-bit round input, with the P permutation pre-applied. */
__constant u32a c_SPtrans[8][64] =
{
  {
    0x02080800, 0x00080000, 0x02000002, 0x02080802,
    0x02000000, 0x00080802, 0x00080002, 0x02000002,
    0x00080802, 0x02080800, 0x02080000, 0x00000802,
    0x02000802, 0x02000000, 0x00000000, 0x00080002,
    0x00080000, 0x00000002, 0x02000800, 0x00080800,
    0x02080802, 0x02080000, 0x00000802, 0x02000800,
    0x00000002, 0x00000800, 0x00080800, 0x02080002,
    0x00000800, 0x02000802, 0x02080002, 0x00000000,
    0x00000000, 0x02080802, 0x02000800, 0x00080002,
    0x02080800, 0x00080000, 0x00000802, 0x02000800,
    0x02080002, 0x00000800, 0x00080800, 0x02000002,
    0x00080802, 0x00000002, 0x02000002, 0x02080000,
    0x02080802, 0x00080800, 0x02080000, 0x02000802,
    0x02000000, 0x00000802, 0x00080002, 0x00000000,
    0x00080000, 0x02000000, 0x02000802, 0x02080800,
    0x00000002, 0x02080002, 0x00000800, 0x00080802,
  },
  {
    0x40108010, 0x00000000, 0x00108000, 0x40100000,
    0x40000010, 0x00008010, 0x40008000, 0x00108000,
    0x00008000, 0x40100010, 0x00000010, 0x40008000,
    0x00100010, 0x40108000, 0x40100000, 0x00000010,
    0x00100000, 0x40008010, 0x40100010, 0x00008000,
    0x00108010, 0x40000000, 0x00000000, 0x00100010,
    0x40008010, 0x00108010, 0x40108000, 0x40000010,
    0x40000000, 0x00100000, 0x00008010, 0x40108010,
    0x00100010, 0x40108000, 0x40008000, 0x00108010,
    0x40108010, 0x00100010, 0x40000010, 0x00000000,
    0x40000000, 0x00008010, 0x00100000, 0x40100010,
    0x00008000, 0x40000000, 0x00108010, 0x40008010,
    0x40108000, 0x00008000, 0x00000000, 0x40000010,
    0x00000010, 0x40108010, 0x00108000, 0x40100000,
    0x40100010, 0x00100000, 0x00008010, 0x40008000,
    0x40008010, 0x00000010, 0x40100000, 0x00108000,
  },
  {
    0x04000001, 0x04040100, 0x00000100, 0x04000101,
    0x00040001, 0x04000000, 0x04000101, 0x00040100,
    0x04000100, 0x00040000, 0x04040000, 0x00000001,
    0x04040101, 0x00000101, 0x00000001, 0x04040001,
    0x00000000, 0x00040001, 0x04040100, 0x00000100,
    0x00000101, 0x04040101, 0x00040000, 0x04000001,
    0x04040001, 0x04000100, 0x00040101, 0x04040000,
    0x00040100, 0x00000000, 0x04000000, 0x00040101,
    0x04040100, 0x00000100, 0x00000001, 0x00040000,
    0x00000101, 0x00040001, 0x04040000, 0x04000101,
    0x00000000, 0x04040100, 0x00040100, 0x04040001,
    0x00040001, 0x04000000, 0x04040101, 0x00000001,
    0x00040101, 0x04000001, 0x04000000, 0x04040101,
    0x00040000, 0x04000100, 0x04000101, 0x00040100,
    0x04000100, 0x00000000, 0x04040001, 0x00000101,
    0x04000001, 0x00040101, 0x00000100, 0x04040000,
  },
  {
    0x00401008, 0x10001000, 0x00000008, 0x10401008,
    0x00000000, 0x10400000, 0x10001008, 0x00400008,
    0x10401000, 0x10000008, 0x10000000, 0x00001008,
    0x10000008, 0x00401008, 0x00400000, 0x10000000,
    0x10400008, 0x00401000, 0x00001000, 0x00000008,
    0x00401000, 0x10001008, 0x10400000, 0x00001000,
    0x00001008, 0x00000000, 0x00400008, 0x10401000,
    0x10001000, 0x10400008, 0x10401008, 0x00400000,
    0x10400008, 0x00001008, 0x00400000, 0x10000008,
    0x00401000, 0x10001000, 0x00000008, 0x10400000,
    0x10001008, 0x00000000, 0x00001000, 0x00400008,
    0x00000000, 0x10400008, 0x10401000, 0x00001000,
    0x10000000, 0x10401008, 0x00401008, 0x00400000,
    0x10401008, 0x00000008, 0x10001000, 0x00401008,
    0x00400008, 0x00401000, 0x10400000, 0x10001008,
    0x00001008, 0x10000000, 0x10000008, 0x10401000,
  },
  {
    0x08000000, 0x00010000, 0x00000400, 0x08010420,
    0x08010020, 0x08000400, 0x00010420, 0x08010000,
    0x00010000, 0x00000020, 0x08000020, 0x00010400,
    0x08000420, 0x08010020, 0x08010400, 0x00000000,
    0x00010400, 0x08000000, 0x00010020, 0x00000420,
    0x08000400, 0x00010420, 0x00000000, 0x08000020,
    0x00000020, 0x08000420, 0x08010420, 0x00010020,
    0x08010000, 0x00000400, 0x00000420, 0x08010400,
    0x08010400, 0x08000420, 0x00010020, 0x08010000,
    0x00010000, 0x00000020, 0x08000020, 0x08000400,
    0x08000000, 0x00010400, 0x08010420, 0x00000000,
    0x00010420, 0x08000000, 0x00000400, 0x00010020,
    0x08000420, 0x00000400, 0x00000000, 0x08010420,
    0x08010020, 0x08010400, 0x00000420, 0x00010000,
    0x00010400, 0x08010020, 0x08000400, 0x00000420,
    0x00000020, 0x00010420, 0x08010000, 0x08000020,
  },
  {
    0x80000040, 0x00200040, 0x00000000, 0x80202000,
    0x00200040, 0x00002000, 0x80002040, 0x00200000,
    0x00002040, 0x80202040, 0x00202000, 0x80000000,
    0x80002000, 0x80000040, 0x80200000, 0x00202040,
    0x00200000, 0x80002040, 0x80200040, 0x00000000,
    0x00002000, 0x00000040, 0x80202000, 0x80200040,
    0x80202040, 0x80200000, 0x80000000, 0x00002040,
    0x00000040, 0x00202000, 0x00202040, 0x80002000,
    0x00002040, 0x80000000, 0x80002000, 0x00202040,
    0x80202000, 0x00200040, 0x00000000, 0x80002000,
    0x80000000, 0x00002000, 0x80200040, 0x00200000,
    0x00200040, 0x80202040, 0x00202000, 0x00000040,
    0x80202040, 0x00202000, 0x00200000, 0x80002040,
    0x80000040, 0x80200000, 0x00202040, 0x00000000,
    0x00002000, 0x80000040, 0x80002040, 0x80202000,
    0x80200000, 0x00002040, 0x00000040, 0x80200040,
  },
  {
    0x00004000, 0x00000200, 0x01000200, 0x01000004,
    0x01004204, 0x00004004, 0x00004200, 0x00000000,
    0x01000000, 0x01000204, 0x00000204, 0x01004000,
    0x00000004, 0x01004200, 0x01004000, 0x00000204,
    0x01000204, 0x00004000, 0x00004004, 0x01004204,
    0x00000000, 0x01000200, 0x01000004, 0x00004200,
    0x01004004, 0x00004204, 0x01004200, 0x00000004,
    0x00004204, 0x01004004, 0x00000200, 0x01000000,
    0x00004204, 0x01004000, 0x01004004, 0x00000204,
    0x00004000, 0x00000200, 0x01000000, 0x01004004,
    0x01000204, 0x00004204, 0x00004200, 0x00000000,
    0x00000200, 0x01000004, 0x00000004, 0x01000200,
    0x00000000, 0x01000204, 0x01000200, 0x00004200,
    0x00000204, 0x00004000, 0x01004204, 0x01000000,
    0x01004200, 0x00000004, 0x00004004, 0x01004204,
    0x01000004, 0x01004200, 0x01004000, 0x00004004,
  },
  {
    0x20800080, 0x20820000, 0x00020080, 0x00000000,
    0x20020000, 0x00800080, 0x20800000, 0x20820080,
    0x00000080, 0x20000000, 0x00820000, 0x00020080,
    0x00820080, 0x20020080, 0x20000080, 0x20800000,
    0x00020000, 0x00820080, 0x00800080, 0x20020000,
    0x20820080, 0x20000080, 0x00000000, 0x00820000,
    0x20000000, 0x00800000, 0x20020080, 0x20800080,
    0x00800000, 0x00020000, 0x20820000, 0x00000080,
    0x00800000, 0x00020000, 0x20000080, 0x20820080,
    0x00020080, 0x20000000, 0x00000000, 0x00820000,
    0x20800080, 0x20020080, 0x20020000, 0x00800080,
    0x20820000, 0x00000080, 0x00800080, 0x20020000,
    0x20820080, 0x00800000, 0x20800000, 0x20000080,
    0x00820000, 0x00020080, 0x20020080, 0x20800000,
    0x00000080, 0x20820000, 0x00820080, 0x00000000,
    0x20000000, 0x20800080, 0x00020000, 0x00820080,
  }
};

/* DES key-schedule lookup tables ("skb", OpenSSL layout): used by
   _des_crypt_keysetup to compute the PC-2 permutation via table lookups
   instead of bit-by-bit extraction. */
__constant u32a c_skb[8][64] =
{
  {
    0x00000000, 0x00000010, 0x20000000, 0x20000010,
    0x00010000, 0x00010010, 0x20010000, 0x20010010,
    0x00000800, 0x00000810, 0x20000800, 0x20000810,
    0x00010800, 0x00010810, 0x20010800, 0x20010810,
    0x00000020, 0x00000030, 0x20000020, 0x20000030,
    0x00010020, 0x00010030, 0x20010020, 0x20010030,
    0x00000820, 0x00000830, 0x20000820, 0x20000830,
    0x00010820, 0x00010830, 0x20010820, 0x20010830,
    0x00080000, 0x00080010, 0x20080000, 0x20080010,
    0x00090000, 0x00090010, 0x20090000, 0x20090010,
    0x00080800, 0x00080810, 0x20080800, 0x20080810,
    0x00090800, 0x00090810, 0x20090800, 0x20090810,
    0x00080020, 0x00080030, 0x20080020, 0x20080030,
    0x00090020, 0x00090030, 0x20090020, 0x20090030,
    0x00080820, 0x00080830, 0x20080820, 0x20080830,
    0x00090820, 0x00090830, 0x20090820, 0x20090830,
  },
  {
    0x00000000, 0x02000000, 0x00002000, 0x02002000,
    0x00200000, 0x02200000, 0x00202000, 0x02202000,
    0x00000004, 0x02000004, 0x00002004, 0x02002004,
    0x00200004, 0x02200004, 0x00202004, 0x02202004,
    0x00000400, 0x02000400, 0x00002400, 0x02002400,
    0x00200400, 0x02200400, 0x00202400, 0x02202400,
    0x00000404, 0x02000404, 0x00002404, 0x02002404,
    0x00200404, 0x02200404, 0x00202404, 0x02202404,
    0x10000000, 0x12000000, 0x10002000, 0x12002000,
    0x10200000, 0x12200000, 0x10202000, 0x12202000,
    0x10000004, 0x12000004, 0x10002004, 0x12002004,
    0x10200004, 0x12200004, 0x10202004, 0x12202004,
    0x10000400, 0x12000400, 0x10002400, 0x12002400,
    0x10200400, 0x12200400, 0x10202400, 0x12202400,
    0x10000404, 0x12000404, 0x10002404, 0x12002404,
    0x10200404, 0x12200404, 0x10202404, 0x12202404,
  },
  {
    0x00000000, 0x00000001, 0x00040000, 0x00040001,
    0x01000000, 0x01000001, 0x01040000, 0x01040001,
    0x00000002, 0x00000003, 0x00040002, 0x00040003,
    0x01000002, 0x01000003, 0x01040002, 0x01040003,
    0x00000200, 0x00000201, 0x00040200, 0x00040201,
    0x01000200, 0x01000201, 0x01040200, 0x01040201,
    0x00000202, 0x00000203, 0x00040202, 0x00040203,
    0x01000202, 0x01000203, 0x01040202, 0x01040203,
    0x08000000, 0x08000001, 0x08040000, 0x08040001,
    0x09000000, 0x09000001, 0x09040000, 0x09040001,
    0x08000002, 0x08000003, 0x08040002, 0x08040003,
    0x09000002, 0x09000003, 0x09040002, 0x09040003,
    0x08000200, 0x08000201, 0x08040200, 0x08040201,
    0x09000200, 0x09000201, 0x09040200, 0x09040201,
    0x08000202, 0x08000203, 0x08040202, 0x08040203,
    0x09000202, 0x09000203, 0x09040202, 0x09040203,
  },
  {
    0x00000000, 0x00100000, 0x00000100, 0x00100100,
    0x00000008, 0x00100008, 0x00000108, 0x00100108,
    0x00001000, 0x00101000, 0x00001100, 0x00101100,
    0x00001008, 0x00101008, 0x00001108, 0x00101108,
    0x04000000, 0x04100000, 0x04000100, 0x04100100,
    0x04000008, 0x04100008, 0x04000108, 0x04100108,
    0x04001000, 0x04101000, 0x04001100, 0x04101100,
    0x04001008, 0x04101008, 0x04001108, 0x04101108,
    0x00020000, 0x00120000, 0x00020100, 0x00120100,
    0x00020008, 0x00120008, 0x00020108, 0x00120108,
    0x00021000, 0x00121000, 0x00021100, 0x00121100,
    0x00021008, 0x00121008, 0x00021108, 0x00121108,
    0x04020000, 0x04120000, 0x04020100, 0x04120100,
    0x04020008, 0x04120008, 0x04020108, 0x04120108,
    0x04021000, 0x04121000, 0x04021100, 0x04121100,
    0x04021008, 0x04121008, 0x04021108, 0x04121108,
  },
  {
    0x00000000, 0x10000000, 0x00010000, 0x10010000,
    0x00000004, 0x10000004, 0x00010004, 0x10010004,
    0x20000000, 0x30000000, 0x20010000, 0x30010000,
    0x20000004, 0x30000004, 0x20010004, 0x30010004,
    0x00100000, 0x10100000, 0x00110000, 0x10110000,
    0x00100004, 0x10100004, 0x00110004, 0x10110004,
    0x20100000, 0x30100000, 0x20110000, 0x30110000,
    0x20100004, 0x30100004, 0x20110004, 0x30110004,
    0x00001000, 0x10001000, 0x00011000, 0x10011000,
    0x00001004, 0x10001004, 0x00011004, 0x10011004,
    0x20001000, 0x30001000, 0x20011000, 0x30011000,
    0x20001004, 0x30001004, 0x20011004, 0x30011004,
    0x00101000, 0x10101000, 0x00111000, 0x10111000,
    0x00101004, 0x10101004, 0x00111004, 0x10111004,
    0x20101000, 0x30101000, 0x20111000, 0x30111000,
    0x20101004, 0x30101004, 0x20111004, 0x30111004,
  },
  {
    0x00000000, 0x08000000, 0x00000008, 0x08000008,
    0x00000400, 0x08000400, 0x00000408, 0x08000408,
    0x00020000, 0x08020000, 0x00020008, 0x08020008,
    0x00020400, 0x08020400, 0x00020408, 0x08020408,
    0x00000001, 0x08000001, 0x00000009, 0x08000009,
    0x00000401, 0x08000401, 0x00000409, 0x08000409,
    0x00020001, 0x08020001, 0x00020009, 0x08020009,
    0x00020401, 0x08020401, 0x00020409, 0x08020409,
    0x02000000, 0x0A000000, 0x02000008, 0x0A000008,
    0x02000400, 0x0A000400, 0x02000408, 0x0A000408,
    0x02020000, 0x0A020000, 0x02020008, 0x0A020008,
    0x02020400, 0x0A020400, 0x02020408, 0x0A020408,
    0x02000001, 0x0A000001, 0x02000009, 0x0A000009,
    0x02000401, 0x0A000401, 0x02000409, 0x0A000409,
    0x02020001, 0x0A020001, 0x02020009, 0x0A020009,
    0x02020401, 0x0A020401, 0x02020409, 0x0A020409,
  },
  {
    0x00000000, 0x00000100, 0x00080000, 0x00080100,
    0x01000000, 0x01000100, 0x01080000, 0x01080100,
    0x00000010, 0x00000110, 0x00080010, 0x00080110,
    0x01000010, 0x01000110, 0x01080010, 0x01080110,
    0x00200000, 0x00200100, 0x00280000, 0x00280100,
    0x01200000, 0x01200100, 0x01280000, 0x01280100,
    0x00200010, 0x00200110, 0x00280010, 0x00280110,
    0x01200010, 0x01200110, 0x01280010, 0x01280110,
    0x00000200, 0x00000300, 0x00080200, 0x00080300,
    0x01000200, 0x01000300, 0x01080200, 0x01080300,
    0x00000210, 0x00000310, 0x00080210, 0x00080310,
    0x01000210, 0x01000310, 0x01080210, 0x01080310,
    0x00200200, 0x00200300, 0x00280200, 0x00280300,
    0x01200200, 0x01200300, 0x01280200, 0x01280300,
    0x00200210, 0x00200310, 0x00280210, 0x00280310,
    0x01200210, 0x01200310, 0x01280210, 0x01280310,
  },
  {
    0x00000000, 0x04000000, 0x00040000, 0x04040000,
    0x00000002, 0x04000002, 0x00040002, 0x04040002,
    0x00002000, 0x04002000, 0x00042000, 0x04042000,
    0x00002002, 0x04002002, 0x00042002, 0x04042002,
    0x00000020, 0x04000020, 0x00040020, 0x04040020,
    0x00000022, 0x04000022, 0x00040022, 0x04040022,
    0x00002020, 0x04002020, 0x00042020, 0x04042020,
    0x00002022, 0x04002022, 0x00042022, 0x04042022,
    0x00000800, 0x04000800, 0x00040800, 0x04040800,
    0x00000802, 0x04000802, 0x00040802, 0x04040802,
    0x00002800, 0x04002800, 0x00042800, 0x04042800,
    0x00002802, 0x04002802, 0x00042802, 0x04042802,
    0x00000820, 0x04000820, 0x00040820, 0x04040820,
    0x00000822, 0x04000822, 0x00040822, 0x04040822,
    0x00002820, 0x04002820, 0x00042820, 0x04042820,
    0x00002822, 0x04002822, 0x00042822, 0x04042822
  }
};

/* Vector-aware table lookup: performs one gather per SIMD lane so a u32x
   index can address the scalar __local tables. */
#if VECT_SIZE == 1
#define BOX(i,n,S) (S)[(n)][(i)]
#elif VECT_SIZE == 2
#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1])
#elif VECT_SIZE == 4
#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3])
#elif VECT_SIZE == 8
#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7])
#elif VECT_SIZE == 16
#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf])
#endif

void _des_crypt_encrypt (u32x iv[2],
u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) +{ + u32x r = data[0]; + u32x l = data[1]; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i += 2) + { + u32x u; + u32x t; + + u = Kc[i + 0] ^ rotl32 (r, 30u); + t = Kd[i + 0] ^ rotl32 (r, 26u); + + l ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + + u = Kc[i + 1] ^ rotl32 (l, 30u); + t = Kd[i + 1] ^ rotl32 (l, 26u); + + r ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + } + + iv[0] = l; + iv[1] = r; +} + +void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 (*s_skb)[64]) +{ + u32x tt; + + PERM_OP (d, c, tt, 4, 0x0f0f0f0f); + HPERM_OP (c, tt, 2, 0xcccc0000); + HPERM_OP (d, tt, 2, 0xcccc0000); + PERM_OP (d, c, tt, 1, 0x55555555); + PERM_OP (c, d, tt, 8, 0x00ff00ff); + PERM_OP (d, c, tt, 1, 0x55555555); + + d = ((d & 0x000000ff) << 16) + | ((d & 0x0000ff00) << 0) + | ((d & 0x00ff0000) >> 16) + | ((c & 0xf0000000) >> 4); + + c = c & 0x0fffffff; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i++) + { + if ((i < 2) || (i == 8) || (i == 15)) + { + c = ((c >> 1) | (c << 27)); + d = ((d >> 1) | (d << 27)); + } + else + { + c = ((c >> 2) | (c << 26)); + d = ((d >> 2) | (d << 26)); + } + + c = c & 0x0fffffff; + d = d & 0x0fffffff; + + const u32x c00 = (c >> 0) & 0x0000003f; + const u32x c06 = (c >> 6) & 0x00383003; + const u32x c07 = (c >> 7) & 0x0000003c; + 
const u32x c13 = (c >> 13) & 0x0000060f; + const u32x c20 = (c >> 20) & 0x00000001; + + u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); + + const u32x d00 = (d >> 0) & 0x00003c3f; + const u32x d07 = (d >> 7) & 0x00003f03; + const u32x d21 = (d >> 21) & 0x0000000f; + const u32x d22 = (d >> 22) & 0x00000030; + + u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); + + Kc[i] = ((t << 16) | (s & 0x0000ffff)); + Kd[i] = ((s >> 16) | (t & 0xffff0000)); + } +} + +void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) +{ + u32x t[8]; + + t[0] = (w0 >> 0) & 0xff; + t[1] = (w0 >> 8) & 0xff; + t[2] = (w0 >> 16) & 0xff; + t[3] = (w0 >> 24) & 0xff; + t[4] = (w1 >> 0) & 0xff; + t[5] = (w1 >> 8) & 0xff; + t[6] = (w1 >> 16) & 0xff; + t[7] = (w1 >> 24) & 0xff; + + u32x k[8]; + + k[0] = (t[0] >> 0); + k[1] = (t[0] << 7) | (t[1] >> 1); + k[2] = (t[1] << 6) | (t[2] >> 2); + k[3] = (t[2] << 5) | (t[3] >> 3); + k[4] = (t[3] << 4) | (t[4] >> 4); + k[5] = (t[4] << 3) | (t[5] >> 5); + k[6] = (t[5] << 2) | (t[6] >> 6); + k[7] = (t[6] << 1); + + out[0] = ((k[0] & 0xff) << 0) + | ((k[1] & 0xff) << 8) + | ((k[2] & 0xff) << 16) + | ((k[3] & 0xff) << 24); + + out[1] = ((k[4] & 0xff) << 0) + | ((k[5] & 0xff) << 8) + | ((k[6] & 0xff) << 16) + | ((k[7] & 0xff) << 24); +} + +__kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
  /**
   * m05500_mxx (a3) - NetNTLMv1 brute-force attack, multi-hash, SIMD:
   * the first password word is OR-combined with the per-iteration mask
   * word from words_buf_r, VECT_SIZE candidates per loop step.
   */

  /**
   * modifier
   */

  const u32 gid = get_global_id (0);
  const u32 lid = get_local_id (0);
  const u32 lsz = get_local_size (0);

  /**
   * sbox, kbox: DES lookup tables staged in local memory.
   */

  __local u32 s_SPtrans[8][64];
  __local u32 s_skb[8][64];

  for (u32 i = lid; i < 64; i += lsz)
  {
    s_SPtrans[0][i] = c_SPtrans[0][i];
    s_SPtrans[1][i] = c_SPtrans[1][i];
    s_SPtrans[2][i] = c_SPtrans[2][i];
    s_SPtrans[3][i] = c_SPtrans[3][i];
    s_SPtrans[4][i] = c_SPtrans[4][i];
    s_SPtrans[5][i] = c_SPtrans[5][i];
    s_SPtrans[6][i] = c_SPtrans[6][i];
    s_SPtrans[7][i] = c_SPtrans[7][i];

    s_skb[0][i] = c_skb[0][i];
    s_skb[1][i] = c_skb[1][i];
    s_skb[2][i] = c_skb[2][i];
    s_skb[3][i] = c_skb[3][i];
    s_skb[4][i] = c_skb[4][i];
    s_skb[5][i] = c_skb[5][i];
    s_skb[6][i] = c_skb[6][i];
    s_skb[7][i] = c_skb[7][i];
  }

  barrier (CLK_LOCAL_MEM_FENCE);

  // Guard after the barrier so every work-item helps fill the tables.
  if (gid >= gid_max) return;

  /**
   * salt: server challenge (s0, s1) and 16-bit early-reject value (s2).
   */

  const u32 s0 = salt_bufs[salt_pos].salt_buf[0];
  const u32 s1 = salt_bufs[salt_pos].salt_buf[1];
  const u32 s2 = salt_bufs[salt_pos].salt_buf[2];

  /**
   * base: load the candidate words once; only w[0] varies per iteration.
   */

  const u32 pw_len = pws[gid].pw_len;

  // Number of u32 words covering pw_len bytes.
  // NOTE(review): float ceil works here because pw_len is small, but the
  // pure-integer form (pw_len + 3) / 4 would avoid the float round-trip.
  const u32 pw_lenv = ceil ((float) pw_len / 4);

  u32x w[64] = { 0 };

  for (int idx = 0; idx < pw_lenv; idx++)
  {
    w[idx] = pws[gid].i[idx];
  }

  /**
   * loop: process VECT_SIZE mask candidates per iteration.
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0 = w0l | w0r;

    w[0] = w0;

    md4_ctx_vector_t ctx;

    md4_init_vector (&ctx);

    md4_update_vector_utf16le (&ctx, w, pw_len);

    md4_final_vector (&ctx);

    const u32x a = ctx.h[0];
    const u32x b = ctx.h[1];
    const u32x c = ctx.h[2];
    const u32x d = ctx.h[3];

    // Skip the DES work when no SIMD lane matches the MD4-tail filter.
    if (MATCHES_NONE_VS ((d >> 16), s2)) continue;

    /**
     * DES1: key = NT hash bytes 0..6
     */

    u32x key[2];

    transform_netntlmv1_key (a, b, key);

    u32x Kc[16];
    u32x Kd[16];

    _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);

    u32x data[2];

    data[0] = s0;
    data[1] = s1;

    u32x out1[2];

    _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans);

    /**
     * DES2: key = NT hash bytes 7..13
     */

    transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key);

    _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb);

    u32x out2[2];

    _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans);

    const u32x r0 = out1[0];
    const u32x r1 = out1[1];
    const u32x r2 = out2[0];
    const u32x r3 = out2[1];

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}

__kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask,
const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_vector_t ctx; + + md4_init_vector 
(&ctx); + + md4_update_vector_utf16le (&ctx, w, pw_len); + + md4_final_vector (&ctx); + + const u32x a = ctx.h[0]; + const u32x b = ctx.h[1]; + const u32x c = ctx.h[2]; + const u32x d = ctx.h[3]; + + if (MATCHES_NONE_VS ((d >> 16), s2)) continue; + + /** + * DES1 + */ + + u32x key[2]; + + transform_netntlmv1_key (a, b, key); + + u32x Kc[16]; + u32x Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32x data[2]; + + data[0] = s0; + data[1] = s1; + + u32x out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + /* + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32x out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + */ + + const u32x r0 = out1[0]; + const u32x r1 = out1[1]; + const u32x r2 = search[2]; + const u32x r3 = search[3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05600_a0.cl b/OpenCL/m05600_a0.cl new file mode 100644 index 000000000..1c2ac82bf --- /dev/null +++ b/OpenCL/m05600_a0.cl @@ -0,0 +1,241 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +__kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t 
*plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + 
md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; 
idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05600_a1.cl b/OpenCL/m05600_a1.cl new file mode 100644 index 000000000..5f494e6d7 --- /dev/null +++ b/OpenCL/m05600_a1.cl @@ -0,0 +1,221 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include 
"inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +__kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md4_ctx_t ctx10; + + md4_init (&ctx10); + + md4_update_global_utf16le (&ctx10, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx1 = ctx10; + + md4_update_global_utf16le (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, 
netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + 
const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md4_ctx_t ctx10; + + md4_init (&ctx10); + + md4_update_global_utf16le (&ctx10, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx1 = ctx10; + + md4_update_global_utf16le (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05600_a3.cl 
b/OpenCL/m05600_a3.cl new file mode 100644 index 000000000..7ec0e36a9 --- /dev/null +++ b/OpenCL/m05600_a3.cl @@ -0,0 +1,251 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +__kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = 
words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t 
*plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); 
+ + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl index b84015eef..760557daa 100644 --- a/OpenCL/m05800-optimized.cl +++ b/OpenCL/m05800-optimized.cl @@ -8,6 +8,7 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_sha1.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" @@ -2111,44 +2112,47 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 u32 tmp4; u32 tmp5; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = swap32_S (append[4]); - const int offset_minus_4 = 4 - (offset & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); - tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); - - const u32 mod = offset & 3; - - if 
(mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp5 = amd_bytealign (in4, 0, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); + tmp5 = swap32_S (tmp5); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (offset & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], append[4], selector); - tmp5 = __byte_perm (append[4], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); + tmp5 = __byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; @@ -2193,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 } } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; 
- u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP 
(SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP 
(SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP 
(SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 
combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m05800.cl b/OpenCL/m05800.cl index 3e9a9c72d..4569e1c41 100644 --- a/OpenCL/m05800.cl +++ b/OpenCL/m05800.cl @@ -2112,44 +2112,47 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 u32 tmp4; u32 tmp5; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = swap32_S (append[4]); - const int offset_minus_4 = 4 - (offset & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); - tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); - - const u32 mod = offset & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp5 = amd_bytealign (in4, 0, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); + tmp5 = swap32_S (tmp5); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (offset & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], append[4], selector); - tmp5 = __byte_perm (append[4], 
0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp3 = __byte_perm (in3, in4, selector); + tmp4 = __byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; @@ -2194,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 } } -void orig_sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, 
B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, 
w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, 
D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += 
E; -} - __kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { const u32 gid = get_global_id (0); @@ -2392,8 +2267,6 @@ __kernel void m05800_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32 (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -2405,8 +2278,6 @@ __kernel void m05800_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } u32 digest[5]; diff --git a/OpenCL/m06000_a0.cl b/OpenCL/m06000_a0.cl new file mode 100644 index 000000000..cc67d8593 --- /dev/null +++ b/OpenCL/m06000_a0.cl @@ -0,0 +1,130 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include 
"inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_ripemd160.cl" + +__kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + ripemd160_ctx_t ctx; + + ripemd160_init (&ctx); + + ripemd160_update (&ctx, w, pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m06000_sxx (__global pw_t *pws, 
__global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + ripemd160_ctx_t ctx; + + ripemd160_init (&ctx); + + ripemd160_update (&ctx, w, pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, 
r1, r2, r3); + } +} diff --git a/OpenCL/m06000_a1.cl b/OpenCL/m06000_a1.cl new file mode 100644 index 000000000..9aa0a3c17 --- /dev/null +++ b/OpenCL/m06000_a1.cl @@ -0,0 +1,110 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_ripemd160.cl" + +__kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + ripemd160_ctx_t ctx0; + + ripemd160_init (&ctx0); + + ripemd160_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + ripemd160_ctx_t ctx = ctx0; + + ripemd160_update_global (&ctx, combs_buf[il_pos].i, 
combs_buf[il_pos].pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m06000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + ripemd160_ctx_t ctx0; + + ripemd160_init (&ctx0); + + ripemd160_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + ripemd160_ctx_t ctx = ctx0; + + ripemd160_update_global (&ctx, combs_buf[il_pos].i, 
combs_buf[il_pos].pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m06000_a3.cl b/OpenCL/m06000_a3.cl new file mode 100644 index 000000000..62f84f02a --- /dev/null +++ b/OpenCL/m06000_a3.cl @@ -0,0 +1,140 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_ripemd160.cl" + +__kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); 
+ + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + ripemd160_ctx_vector_t ctx; + + ripemd160_init_vector (&ctx); + + ripemd160_update_vector (&ctx, w, pw_len); + + ripemd160_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m06000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 
digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + ripemd160_ctx_vector_t ctx; + + ripemd160_init_vector (&ctx); + + ripemd160_update_vector (&ctx, w, pw_len); + + ripemd160_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m06100_a0.cl b/OpenCL/m06100_a0.cl index 48fcad611..5b798d0c5 100644 --- a/OpenCL/m06100_a0.cl +++ b/OpenCL/m06100_a0.cl @@ -70,8 +70,6 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -166,8 +164,6 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m06100_a3.cl b/OpenCL/m06100_a3.cl index da99bac26..a9bd2a358 100644 --- a/OpenCL/m06100_a3.cl +++ b/OpenCL/m06100_a3.cl @@ -68,8 +68,6 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -170,8 +168,6 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } 
/** diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl index 0645a12fd..a0c85c49c 100644 --- a/OpenCL/m06300-optimized.cl +++ b/OpenCL/m06300-optimized.cl @@ -8,109 +8,12 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - 
MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I 
, a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -118,44 +21,45 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - - 
const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -222,7 +126,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -230,44 +134,47 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - - const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 
tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -334,44 +241,41 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - const int offset_minus_4 = 4 - (block_len & 3); - - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - - 
const u32 mod = block_len & 3; - - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, offset); + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m06300.cl b/OpenCL/m06300.cl index dbe156e8d..0397d0362 100644 --- a/OpenCL/m06300.cl +++ b/OpenCL/m06300.cl @@ -38,8 +38,6 @@ __kernel void m06300_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -51,8 +49,6 @@ __kernel void m06300_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -146,8 +142,6 @@ __kernel void m06300_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -159,8 +153,6 @@ __kernel void m06300_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m07000_a0.cl 
b/OpenCL/m07000_a0.cl new file mode 100644 index 000000000..412a929ef --- /dev/null +++ b/OpenCL/m07000_a0.cl @@ -0,0 +1,195 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, 
salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid 
= get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07000_a1.cl b/OpenCL/m07000_a1.cl new file mode 100644 index 000000000..34e434180 --- /dev/null +++ b/OpenCL/m07000_a1.cl @@ -0,0 +1,170 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07000_mxx 
(__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + 
sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { 
+ sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07000_a3.cl b/OpenCL/m07000_a3.cl new file mode 100644 index 000000000..6f5bf96ed --- /dev/null +++ b/OpenCL/m07000_a3.cl @@ -0,0 +1,208 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 
*d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32x p0[4]; + u32x p1[4]; + u32x p2[4]; + u32x p3[4]; + + p0[0] = swap32 (FORTIGATE_A); + p0[1] = swap32 (FORTIGATE_B); + p0[2] = swap32 (FORTIGATE_C); + p0[3] = swap32 (FORTIGATE_D); + p1[0] = swap32 (FORTIGATE_E); + p1[1] = swap32 (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_vector_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const 
u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32x p0[4]; + u32x p1[4]; + u32x p2[4]; + u32x 
p3[4]; + + p0[0] = swap32 (FORTIGATE_A); + p0[1] = swap32 (FORTIGATE_B); + p0[2] = swap32 (FORTIGATE_C); + p0[3] = swap32 (FORTIGATE_D); + p1[0] = swap32 (FORTIGATE_E); + p1[1] = swap32 (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_vector_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07300_a0.cl b/OpenCL/m07300_a0.cl new file mode 100644 index 000000000..596db3a54 --- /dev/null +++ b/OpenCL/m07300_a0.cl @@ -0,0 +1,130 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 
salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_swap (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_swap (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07300_a1.cl b/OpenCL/m07300_a1.cl new file mode 100644 index 000000000..775d68a0f --- /dev/null +++ b/OpenCL/m07300_a1.cl @@ -0,0 +1,168 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 
*bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, c, pw_len + comb_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07300_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i 
= 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, c, pw_len + comb_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07300_a3.cl b/OpenCL/m07300_a3.cl new file mode 100644 index 000000000..4f5532822 --- /dev/null +++ b/OpenCL/m07300_a3.cl @@ -0,0 +1,140 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + 
+ const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0 = w0l | w0r; + + w[0] = w0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * 
modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0 = w0l | w0r; + + w[0] = w0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index e3a9fbda5..a52655b1b 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -32,8 +32,6 @@ __constant u32a k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -#if 1 - void sha256_transform (const u32 w[16], u32 digest[8]) { u32 a = digest[0]; @@ -190,49 +188,53 @@ void bswap8 (u32 block[16]) block[ 7] = swap32 (block[ 7]); } -u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat16 (u32 block[16], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int 
offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -322,56 +324,60 @@ u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } 
-u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u32 append_len, u32 digest[8]) +u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 append_len, u32 digest[8]) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, 
selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif u32 carry[4] = { 0, 0, 0, 0 }; - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -471,7 +477,7 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; if (new_len >= 64) { @@ -490,49 +496,53 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 return new_len; } -u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat20 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = 
(0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -632,52 +642,58 @@ u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 break; } - return block_len + append_len; + return offset + append_len; } -u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat20_x80 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, 
offset); + tmp4 = amd_bytealign (in3, in4, offset); - if (mod == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -777,7 +793,7 @@ u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const break; } - return block_len + append_len; + return offset + append_len; } __kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 
*d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -1213,543 +1229,3 @@ __kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rul #include COMPARE_M } - -#else - -// this is basically a much cleaner version, but apparently drops speeds by over 100% :( - -#define PUTCHAR32_BE(a,p,c) ((u8 *)(a))[(p) ^ 3] = (u8) (c) -#define GETCHAR32_BE(a,p) ((u8 *)(a))[(p) ^ 3] - -typedef struct -{ - u32 state[8]; - u32 buf[32]; - int len; - -} sha256_ctx_t; - -void sha256_transform (const u32 w[16], u32 digest[8]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; - - u32 w0_t = w[ 0]; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; - - #define ROUND_EXPAND() \ - { \ - w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ - w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ - w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ - w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \ - w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ - w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ - w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ - w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ - w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ - w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ - wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ - wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ - wc_t = SHA256_EXPAND (wa_t, 
w5_t, wd_t, wc_t); \ - wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ - we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ - wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ - } - - #define ROUND_STEP(i) \ - { \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ - } - - ROUND_STEP (0); - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 16; i < 64; i += 16) - { - ROUND_EXPAND (); ROUND_STEP (i); - } - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; - digest[4] += e; - digest[5] += f; - digest[6] += g; - digest[7] += h; -} - -void sha256_init (sha256_ctx_t *sha256_ctx) -{ - 
sha256_ctx->state[0] = SHA256M_A; - sha256_ctx->state[1] = SHA256M_B; - sha256_ctx->state[2] = SHA256M_C; - sha256_ctx->state[3] = SHA256M_D; - sha256_ctx->state[4] = SHA256M_E; - sha256_ctx->state[5] = SHA256M_F; - sha256_ctx->state[6] = SHA256M_G; - sha256_ctx->state[7] = SHA256M_H; - - sha256_ctx->len = 0; -} - -void sha256_update (sha256_ctx_t *sha256_ctx, const u32 *buf, int len) -{ - int pos = sha256_ctx->len & 0x3f; - - sha256_ctx->len += len; - - if ((pos + len) < 64) - { - for (int i = 0; i < len; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i)); - } - - return; - } - - int cnt = 64 - pos; - - for (int i = 0; i < cnt; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i)); - } - - sha256_transform (sha256_ctx->buf, sha256_ctx->state); - - len -= cnt; - - for (int i = 0; i < len; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, i, GETCHAR32_BE (buf, cnt + i)); - } -} - -void sha256_final (sha256_ctx_t *sha256_ctx) -{ - int pos = sha256_ctx->len & 0x3f; - - for (int i = pos; i < 64; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, i, 0); - } - - PUTCHAR32_BE (sha256_ctx->buf, pos, 0x80); - - if (pos >= 56) - { - sha256_transform (sha256_ctx->buf, sha256_ctx->state); - - sha256_ctx->buf[ 0] = 0; - sha256_ctx->buf[ 1] = 0; - sha256_ctx->buf[ 2] = 0; - sha256_ctx->buf[ 3] = 0; - sha256_ctx->buf[ 4] = 0; - sha256_ctx->buf[ 5] = 0; - sha256_ctx->buf[ 6] = 0; - sha256_ctx->buf[ 7] = 0; - sha256_ctx->buf[ 8] = 0; - sha256_ctx->buf[ 9] = 0; - sha256_ctx->buf[10] = 0; - sha256_ctx->buf[11] = 0; - sha256_ctx->buf[12] = 0; - sha256_ctx->buf[13] = 0; - sha256_ctx->buf[14] = 0; - sha256_ctx->buf[15] = 0; - } - - sha256_ctx->buf[15] = sha256_ctx->len * 8; - - sha256_transform (sha256_ctx->buf, sha256_ctx->state); -} - -__kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 
*bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - u32 pw[4]; - - pw[0] = swap32 (pws[gid].i[0]); - pw[1] = swap32 (pws[gid].i[1]); - pw[2] = swap32 (pws[gid].i[2]); - pw[3] = swap32 (pws[gid].i[3]); - - const u32 pw_len = pws[gid].pw_len; - - /** - * salt - */ - - u32 salt[4]; - - salt[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); - salt[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); - salt[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); - salt[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); - - u32 salt_len = salt_bufs[salt_pos].salt_len; - - /** - * begin - */ - - sha256_ctx_t sha256_ctx; - - sha256_init (&sha256_ctx); - - sha256_update (&sha256_ctx, pw, pw_len); - sha256_update (&sha256_ctx, salt, salt_len); - sha256_update (&sha256_ctx, pw, pw_len); - - sha256_final (&sha256_ctx); - - u32 tmp[8]; - - tmp[0] = sha256_ctx.state[0]; - tmp[1] = sha256_ctx.state[1]; - tmp[2] = sha256_ctx.state[2]; - tmp[3] = sha256_ctx.state[3]; - tmp[4] = sha256_ctx.state[4]; - tmp[5] = sha256_ctx.state[5]; - tmp[6] = sha256_ctx.state[6]; - tmp[7] = sha256_ctx.state[7]; - - sha256_init (&sha256_ctx); - - 
sha256_update (&sha256_ctx, pw, pw_len); - sha256_update (&sha256_ctx, salt, salt_len); - sha256_update (&sha256_ctx, tmp, pw_len); - - for (u32 j = pw_len; j; j >>= 1) - { - if (j & 1) - { - sha256_update (&sha256_ctx, tmp, 32); - } - else - { - sha256_update (&sha256_ctx, pw, pw_len); - } - } - - sha256_final (&sha256_ctx); - - tmps[gid].alt_result[0] = sha256_ctx.state[0]; - tmps[gid].alt_result[1] = sha256_ctx.state[1]; - tmps[gid].alt_result[2] = sha256_ctx.state[2]; - tmps[gid].alt_result[3] = sha256_ctx.state[3]; - tmps[gid].alt_result[4] = sha256_ctx.state[4]; - tmps[gid].alt_result[5] = sha256_ctx.state[5]; - tmps[gid].alt_result[6] = sha256_ctx.state[6]; - tmps[gid].alt_result[7] = sha256_ctx.state[7]; - - // p_bytes - - sha256_init (&sha256_ctx); - - for (u32 j = 0; j < pw_len; j++) - { - sha256_update (&sha256_ctx, pw, pw_len); - } - - sha256_final (&sha256_ctx); - - tmps[gid].p_bytes[0] = sha256_ctx.state[0]; - tmps[gid].p_bytes[1] = sha256_ctx.state[1]; - tmps[gid].p_bytes[2] = sha256_ctx.state[2]; - tmps[gid].p_bytes[3] = sha256_ctx.state[3]; - - // s_bytes - - sha256_init (&sha256_ctx); - - for (u32 j = 0; j < 16 + ((tmps[gid].alt_result[0] >> 24) & 0xff); j++) - { - sha256_update (&sha256_ctx, salt, salt_len); - } - - sha256_final (&sha256_ctx); - - tmps[gid].s_bytes[0] = sha256_ctx.state[0]; - tmps[gid].s_bytes[1] = sha256_ctx.state[1]; - tmps[gid].s_bytes[2] = sha256_ctx.state[2]; - tmps[gid].s_bytes[3] = sha256_ctx.state[3]; -} - -__kernel void m07400_loop (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global 
plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - u32 p_bytes0[4]; - - p_bytes0[0] = tmps[gid].p_bytes[0]; - p_bytes0[1] = tmps[gid].p_bytes[1]; - p_bytes0[2] = tmps[gid].p_bytes[2]; - p_bytes0[3] = tmps[gid].p_bytes[3]; - - const u32 pw_len = pws[gid].pw_len; - - u32 s_bytes0[4]; - - s_bytes0[0] = tmps[gid].s_bytes[0]; - s_bytes0[1] = tmps[gid].s_bytes[1]; - s_bytes0[2] = tmps[gid].s_bytes[2]; - s_bytes0[3] = tmps[gid].s_bytes[3]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; - - u32 wpc_len[8]; - - wpc_len[0] = 32 + 0 + 0 + pw_len; - wpc_len[1] = pw_len + 0 + 0 + 32; - wpc_len[2] = 32 + salt_len + 0 + pw_len; - wpc_len[3] = pw_len + salt_len + 0 + 32; - wpc_len[4] = 32 + 0 + pw_len + pw_len; - wpc_len[5] = pw_len + 0 + pw_len + 32; - wpc_len[6] = 32 + salt_len + pw_len + pw_len; - wpc_len[7] = pw_len + salt_len + pw_len + 32; - - u32 wpc[8][32] = { { 0 } }; - - for (u32 i = 0; i < 8; i++) - { - u32 block_len = 0; - - if (i & 1) - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - else - { - block_len += 32; - } - - if (i & 2) - { - for (u32 j = 0; j < salt_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (s_bytes0, j)); - } - } - - if (i & 4) - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - - if (i & 1) - { - block_len += 32; 
- } - else - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - - PUTCHAR32_BE (wpc[i], block_len, 0x80); - - if (block_len < 56) - { - wpc[i][15] = block_len * 8; - } - else - { - wpc[i][31] = block_len * 8; - } - } - - /** - * base - */ - - u32 alt_result[8]; - - alt_result[0] = tmps[gid].alt_result[0]; - alt_result[1] = tmps[gid].alt_result[1]; - alt_result[2] = tmps[gid].alt_result[2]; - alt_result[3] = tmps[gid].alt_result[3]; - alt_result[4] = tmps[gid].alt_result[4]; - alt_result[5] = tmps[gid].alt_result[5]; - alt_result[6] = tmps[gid].alt_result[6]; - alt_result[7] = tmps[gid].alt_result[7]; - - /* Repeatedly run the collected hash value through SHA256 to burn - CPU cycles. */ - - for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++) - { - const u32 j1 = (j & 1) ? 1 : 0; - const u32 j3 = (j % 3) ? 2 : 0; - const u32 j7 = (j % 7) ? 4 : 0; - - const u32 pc = j1 + j3 + j7; - - u32 block[32]; - - block[ 0] = wpc[pc][ 0]; - block[ 1] = wpc[pc][ 1]; - block[ 2] = wpc[pc][ 2]; - block[ 3] = wpc[pc][ 3]; - block[ 4] = wpc[pc][ 4]; - block[ 5] = wpc[pc][ 5]; - block[ 6] = wpc[pc][ 6]; - block[ 7] = wpc[pc][ 7]; - block[ 8] = wpc[pc][ 8]; - block[ 9] = wpc[pc][ 9]; - block[10] = wpc[pc][10]; - block[11] = wpc[pc][11]; - block[12] = wpc[pc][12]; - block[13] = wpc[pc][13]; - block[14] = wpc[pc][14]; - block[15] = wpc[pc][15]; - block[16] = wpc[pc][16]; - block[17] = wpc[pc][17]; - block[18] = wpc[pc][18]; - block[19] = wpc[pc][19]; - block[20] = wpc[pc][20]; - block[21] = wpc[pc][21]; - block[22] = wpc[pc][22]; - block[23] = wpc[pc][23]; - block[24] = wpc[pc][24]; - block[25] = wpc[pc][25]; - block[26] = wpc[pc][26]; - block[27] = wpc[pc][27]; - block[28] = wpc[pc][28]; - block[29] = wpc[pc][29]; - block[30] = wpc[pc][30]; - block[31] = wpc[pc][31]; - - const u32 block_len = wpc_len[pc]; - - if (j1) - { - #ifdef _unroll - #pragma unroll - #endif - for (u32 k = 0, p = block_len - 32; k < 32; k++, p++) - 
{ - PUTCHAR32_BE (block, p, GETCHAR32_BE (alt_result, k)); - } - } - else - { - block[0] = alt_result[0]; - block[1] = alt_result[1]; - block[2] = alt_result[2]; - block[3] = alt_result[3]; - block[4] = alt_result[4]; - block[5] = alt_result[5]; - block[6] = alt_result[6]; - block[7] = alt_result[7]; - } - - alt_result[0] = SHA256M_A; - alt_result[1] = SHA256M_B; - alt_result[2] = SHA256M_C; - alt_result[3] = SHA256M_D; - alt_result[4] = SHA256M_E; - alt_result[5] = SHA256M_F; - alt_result[6] = SHA256M_G; - alt_result[7] = SHA256M_H; - - sha256_transform (block, alt_result); - - if (block_len >= 56) - { - sha256_transform (block + 16, alt_result); - } - } - - tmps[gid].alt_result[0] = alt_result[0]; - tmps[gid].alt_result[1] = alt_result[1]; - tmps[gid].alt_result[2] = alt_result[2]; - tmps[gid].alt_result[3] = alt_result[3]; - tmps[gid].alt_result[4] = alt_result[4]; - tmps[gid].alt_result[5] = alt_result[5]; - tmps[gid].alt_result[6] = alt_result[6]; - tmps[gid].alt_result[7] = alt_result[7]; -} - -__kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - const u32 lid = get_local_id (0); - - const u32 r0 = swap32 (tmps[gid].alt_result[0]); - const u32 r1 = swap32 (tmps[gid].alt_result[1]); - const u32 r2 = swap32 (tmps[gid].alt_result[2]); - const u32 r3 = swap32 (tmps[gid].alt_result[3]); - - #define il_pos 0 - - #include COMPARE_M -} - -#endif diff --git a/OpenCL/m07400.cl b/OpenCL/m07400.cl index 936add9f0..9eda4d84a 100644 --- a/OpenCL/m07400.cl +++ b/OpenCL/m07400.cl @@ -36,8 +36,6 @@ __kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < pw_lenv; idx++) @@ -54,8 +52,6 @@ __kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < salt_lenv; idx++) diff --git a/OpenCL/m07500_a0.cl b/OpenCL/m07500_a0.cl new file mode 100644 index 000000000..72491071f --- /dev/null +++ b/OpenCL/m07500_a0.cl @@ -0,0 +1,421 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + 
__local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j 
+= rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], u32 timestamp_ct[8]) +{ + rc4_init_16 (rc4_key, data); + + u32 out[4]; + + u8 j = 0; + + j = rc4_next_16 (rc4_key, 0, j, timestamp_ct + 0, out); + + if ((out[3] & 0xffff0000) != 0x30320000) return 0; + + j = rc4_next_16 (rc4_key, 16, j, timestamp_ct + 4, out); + + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 1; + w0[1] = 0; + 
w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; +} + +__kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 
il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m07500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 
*bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; 
+ + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m07500_a1.cl b/OpenCL/m07500_a1.cl new file mode 100644 index 000000000..0942bb921 --- /dev/null +++ b/OpenCL/m07500_a1.cl @@ -0,0 +1,401 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); 
swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], u32 timestamp_ct[8]) +{ + rc4_init_16 (rc4_key, data); + + u32 out[4]; + + u8 j = 0; + + j = rc4_next_16 (rc4_key, 0, j, timestamp_ct + 0, out); + + if ((out[3] & 0xffff0000) != 0x30320000) return 0; + + j = rc4_next_16 (rc4_key, 16, j, timestamp_ct + 4, out); + + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) 
return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 1; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; 
+ w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; +} + +__kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + 
timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m07500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 
loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m07500_a3.cl b/OpenCL/m07500_a3.cl new file mode 100644 index 000000000..9271e3924 --- /dev/null +++ b/OpenCL/m07500_a3.cl @@ -0,0 +1,455 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + 
+#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 
in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], u32 timestamp_ct[8]) +{ + rc4_init_16 (rc4_key, data); + + u32 out[4]; + + u8 j = 0; + + j = rc4_next_16 (rc4_key, 0, j, timestamp_ct + 0, out); + + if ((out[3] & 0xffff0000) != 0x30320000) return 0; + + j = rc4_next_16 (rc4_key, 16, j, timestamp_ct + 4, out); + + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; + + return 
1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 1; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; +} + +__kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = 
krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m07500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + 
digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m07700_a0-optimized.cl b/OpenCL/m07700_a0-optimized.cl index ce1601902..8a6b88cf1 100644 --- a/OpenCL/m07700_a0-optimized.cl 
+++ b/OpenCL/m07700_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -329,180 +330,41 @@ __kernel void m07700_m04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP 
(MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, 
MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, out_len, salt_buf0, salt_len, a, b, c, d, t); - - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - 
MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, 
t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_M_SIMD (a, b, c, d); + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -644,180 +506,41 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - 
MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP 
(MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, out_len, salt_buf0, salt_len, a, b, c, d, t); - - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, 
a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); 
- MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_S_SIMD (a, b, c, d); + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m07700_a1-optimized.cl b/OpenCL/m07700_a1-optimized.cl index 122a2b86e..d2d0350c7 100644 --- a/OpenCL/m07700_a1-optimized.cl +++ b/OpenCL/m07700_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -370,180 +371,41 @@ __kernel void m07700_m04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = 
pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - 
MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 
2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, 
MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , 
b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_M_SIMD (a, b, c, d); + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -728,180 +590,41 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP 
(MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, 
MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP 
(MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], 
MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_S_SIMD (a, b, c, d); + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m07700_a3-optimized.cl b/OpenCL/m07700_a3-optimized.cl index d5387f9ed..9b0b3acdd 100644 --- a/OpenCL/m07700_a3-optimized.cl +++ b/OpenCL/m07700_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -85,10 +86,10 @@ u32 walld0rf_magic (const u32 w0[4], const u32 pw_len, const u32 salt_buf0[4], c t[15] = 0; u32 sum20 = ((a >> 24) & 3) - + ((a >> 16) & 3) - + ((a >> 8) & 3) - + ((a >> 0) & 3) - + ((b >> 8) & 3); + + ((a >> 16) & 3) + + ((a >> 8) & 3) + + ((a >> 0) & 3) + + ((b >> 8) & 3); sum20 |= 0x20; @@ -259,6 +260,8 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl s3[2] = 0; s3[3] = 0; + append_0x80_4x4_S (s0, s1, s2, s3, salt_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 
pw_salt_len = pw_len + salt_len; @@ -284,7 +287,7 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[ 2] = s0[2]; t[ 3] = s0[3]; t[ 4] = s1[0]; - t[ 5] = 0; + t[ 5] = s1[1]; t[ 6] = 0; t[ 7] = 0; t[ 8] = 0; @@ -296,180 +299,39 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); - /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, 
MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, 
b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], 
MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP 
(MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_M_SIMD (a, b, c, d); + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -523,6 +385,8 @@ void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl s3[2] = 0; s3[3] = 0; + append_0x80_4x4_S (s0, s1, s2, s3, salt_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -560,7 +424,7 @@ void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[ 2] = s0[2]; t[ 3] = s0[3]; t[ 4] = s1[0]; - t[ 5] = 0; + t[ 5] = s1[1]; t[ 6] = 0; t[ 7] = 0; t[ 8] = 0; @@ -572,180 +436,39 @@ void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); - /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP 
(MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , 
c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, 
t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - 
MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_S_SIMD (a, b, c, d); + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m07800_a0-optimized.cl b/OpenCL/m07800_a0-optimized.cl index 9c0edfb9d..616e5483c 100644 --- a/OpenCL/m07800_a0-optimized.cl +++ b/OpenCL/m07800_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" __constant u32 
theMagicArray[64] = { @@ -52,134 +53,6 @@ void SETSHIFTEDINT (u32 *a, const int n, const u32 v) a[d + 1] = l32_from_64_S (tmp); } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, 
A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, 
C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, 
E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m07800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const 
u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m07800_a1-optimized.cl b/OpenCL/m07800_a1-optimized.cl index 1d6e36112..3de195d64 100644 --- a/OpenCL/m07800_a1-optimized.cl +++ b/OpenCL/m07800_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" __constant u32 theMagicArray[64] = { @@ -50,134 +51,6 @@ void SETSHIFTEDINT (u32 *a, const int n, const u32 v) a[d + 1] = l32_from_64_S (tmp); } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP 
(SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, 
B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, 
E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, 
A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m07800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m07800_a3-optimized.cl b/OpenCL/m07800_a3-optimized.cl index 2dfe9f04b..b57f9496b 100644 --- a/OpenCL/m07800_a3-optimized.cl +++ b/OpenCL/m07800_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" __constant u32 theMagicArray[64] = { @@ -50,134 +51,6 @@ void 
SETSHIFTEDINT (u32 *a, const int n, const u32 v) a[d + 1] = l32_from_64_S (tmp); } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t 
^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ 
wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ 
w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - void m07800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const 
u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m07900.cl b/OpenCL/m07900.cl index d50b1d4c2..eaca08e00 100644 --- a/OpenCL/m07900.cl +++ b/OpenCL/m07900.cl @@ -66,8 +66,6 @@ __kernel void m07900_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < pw_lenv; idx++) diff --git a/OpenCL/m08100_a0.cl b/OpenCL/m08100_a0.cl new file mode 100644 index 000000000..3f3839efa --- /dev/null +++ b/OpenCL/m08100_a0.cl @@ -0,0 +1,138 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, 
__global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08100_a1.cl b/OpenCL/m08100_a1.cl new file mode 100644 index 000000000..6fd9dcbec --- /dev/null +++ b/OpenCL/m08100_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include 
"inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, 
__global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08100_a3.cl b/OpenCL/m08100_a3.cl 
new file mode 100644 index 000000000..746fd240b --- /dev/null +++ b/OpenCL/m08100_a3.cl @@ -0,0 +1,152 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for 
(u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector (&ctx, w, pw_len + 1); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = 
pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector (&ctx, w, pw_len + 1); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08300_a0-optimized.cl b/OpenCL/m08300_a0-optimized.cl index 654747a75..0e3ed4fd2 100644 --- a/OpenCL/m08300_a0-optimized.cl +++ b/OpenCL/m08300_a0-optimized.cl @@ -13,134 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP 
(SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ 
w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t 
^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ 
wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -204,7 +77,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * 
loop @@ -355,7 +228,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -384,7 +257,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); @@ -461,7 +334,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * digest @@ -624,7 +497,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -653,7 +526,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); diff --git a/OpenCL/m08300_a0.cl b/OpenCL/m08300_a0.cl new file mode 100644 index 000000000..906050bef --- /dev/null +++ b/OpenCL/m08300_a0.cl @@ -0,0 +1,264 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void 
m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; 
il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update (&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, 
const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update (&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + 
ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08300_a1-optimized.cl b/OpenCL/m08300_a1-optimized.cl index 889ee8c28..3ed52170c 100644 --- a/OpenCL/m08300_a1-optimized.cl +++ b/OpenCL/m08300_a1-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP 
(SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP 
(SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); 
SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); 
SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -202,7 +75,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * loop @@ -413,7 +286,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -442,7 +315,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = 
SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); @@ -519,7 +392,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * digest @@ -742,7 +615,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -771,7 +644,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); diff --git a/OpenCL/m08300_a1.cl b/OpenCL/m08300_a1.cl new file mode 100644 index 000000000..c5fd82a79 --- /dev/null +++ b/OpenCL/m08300_a1.cl @@ -0,0 +1,240 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = ((pws[gid].pw_len + combs_buf[il_pos].pw_len) & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_global_swap (&ctx1, pws[gid].i, pws[gid].pw_len); + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update (&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < 
salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + 
digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = ((pws[gid].pw_len + combs_buf[il_pos].pw_len) & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_global_swap (&ctx1, pws[gid].i, pws[gid].pw_len); + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update (&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08300_a3-optimized.cl b/OpenCL/m08300_a3-optimized.cl index 2aaa7222f..6827e92f9 100644 --- 
a/OpenCL/m08300_a3-optimized.cl +++ b/OpenCL/m08300_a3-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 
((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ 
w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 
((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -181,7 +54,7 @@ void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl domain_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[ 6]); domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; u32 s0[4]; u32 s1[4]; @@ -312,7 +185,7 @@ void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -341,7 +214,7 @@ void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); @@ -389,7 +262,7 @@ void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl domain_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[ 6]); domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; u32 s0[4]; u32 s1[4]; @@ 
-532,7 +405,7 @@ void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -561,7 +434,7 @@ void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); diff --git a/OpenCL/m08300_a3.cl b/OpenCL/m08300_a3.cl new file mode 100644 index 000000000..c15333c33 --- /dev/null +++ b/OpenCL/m08300_a3.cl @@ -0,0 +1,274 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 
il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32x s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32 (salt_bufs[salt_pos].salt_buf_pc[idx]); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_update_vector (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update_vector (&ctx1, s, salt_len); + + sha1_final_vector (&ctx1); + + u32x digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + 
digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32x r0 = digest[DGST_R0]; + const u32x r1 = digest[DGST_R1]; + const u32x r2 = digest[DGST_R2]; + const u32x r3 = digest[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 
salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32x s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32 (salt_bufs[salt_pos].salt_buf_pc[idx]); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_update_vector (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update_vector (&ctx1, s, salt_len); + + sha1_final_vector (&ctx1); + + u32x digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32x r0 = digest[DGST_R0]; + const u32x r1 = digest[DGST_R1]; + const u32x r2 = digest[DGST_R2]; + const u32x r3 = digest[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08400_a0.cl b/OpenCL/m08400_a0.cl new file mode 100644 index 000000000..123887eb7 --- /dev/null +++ b/OpenCL/m08400_a0.cl @@ -0,0 
+1,356 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + 
w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + 
w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le 
((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08400_a1.cl b/OpenCL/m08400_a1.cl new file mode 100644 index 000000000..f2ecd46e6 --- /dev/null +++ b/OpenCL/m08400_a1.cl @@ -0,0 +1,336 @@ +/** + * 
Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 
0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 
= ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] 
= uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08400_a3.cl b/OpenCL/m08400_a3.cl new file mode 100644 index 000000000..541cd096f --- /dev/null +++ b/OpenCL/m08400_a3.cl @@ -0,0 +1,374 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + 
+#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 
bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_final_vector (&ctx1); + + u32x a = ctx1.h[0]; + u32x b = ctx1.h[1]; + u32x c = ctx1.h[2]; + u32x d = ctx1.h[3]; + u32x e = ctx1.h[4]; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector_from_scalar (&ctx2, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 
+ | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 
255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_final_vector (&ctx1); + + u32x a = ctx1.h[0]; + u32x b = ctx1.h[1]; + u32x c = ctx1.h[2]; + u32x d = ctx1.h[3]; + u32x e = ctx1.h[4]; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector_from_scalar (&ctx2, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 
0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = 
ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08500_a0-optimized.cl b/OpenCL/m08500_a0.cl similarity index 83% rename from OpenCL/m08500_a0-optimized.cl rename to OpenCL/m08500_a0.cl index ba9bc66fd..2a1d7a98a 100644 --- a/OpenCL/m08500_a0-optimized.cl +++ b/OpenCL/m08500_a0.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -508,20 +508,20 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_racf_key (const u32x w0, const u32x w1, u32x key[2], __local u32 *s_ascii_to_ebcdic_pc) +void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) { - key[0] = BOX1 (((w0 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w0 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w0 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w0 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[0] = BOX1 (((w0 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w0 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w0 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w0 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; - key[1] = BOX1 (((w1 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w1 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w1 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w1 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[1] = BOX1 (((w1 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w1 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w1 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } -__kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -535,7 +535,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 
s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -560,11 +559,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -615,7 +609,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; @@ -640,15 +634,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, 
__global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_sxx (__global pw_t *pws, __global const kernel_rule_t 
*rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -662,7 +648,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -687,11 +672,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -754,7 +734,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; @@ -778,11 +758,3 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (iv[0], iv[1], z, z); } } - -__kernel void m08500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global 
const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 
gid_max) -{ -} diff --git a/OpenCL/m08500_a1-optimized.cl b/OpenCL/m08500_a1.cl similarity index 84% rename from OpenCL/m08500_a1-optimized.cl rename to OpenCL/m08500_a1.cl index f76728dd8..50d221f52 100644 --- a/OpenCL/m08500_a1-optimized.cl +++ b/OpenCL/m08500_a1.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -506,20 +506,20 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_racf_key (const u32x w0, const u32x w1, u32x key[2], __local u32 *s_ascii_to_ebcdic_pc) +void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) { - key[0] = BOX1 (((w0 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w0 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w0 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w0 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[0] = BOX1 (((w0 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w0 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w0 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w0 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; - key[1] = BOX1 (((w1 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w1 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w1 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w1 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[1] = BOX1 (((w1 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w1 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w1 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } -__kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 
*bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -533,7 +533,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ 
-558,11 +557,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -656,7 +650,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; @@ -681,15 +675,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global 
const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, 
__global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -703,7 +689,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -728,11 +713,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -838,7 +818,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; @@ -862,11 +842,3 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (iv[0], iv[1], z, z); } } - -__kernel void m08500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, 
__global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m08500_a3-optimized.cl b/OpenCL/m08500_a3.cl similarity 
index 76% rename from OpenCL/m08500_a3-optimized.cl rename to OpenCL/m08500_a3.cl index 3bdeae561..385ba2ee4 100644 --- a/OpenCL/m08500_a3-optimized.cl +++ b/OpenCL/m08500_a3.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -506,20 +506,20 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_racf_key (const u32x w0, const u32x w1, u32x key[2], __local u32 *s_ascii_to_ebcdic_pc) +void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) { - key[0] = BOX1 (((w0 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w0 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w0 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w0 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[0] = BOX1 (((w0 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w0 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w0 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w0 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; - key[1] = BOX1 (((w1 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w1 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w1 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w1 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[1] = BOX1 (((w1 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w1 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w1 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } -void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u32 *s_ascii_to_ebcdic_pc, u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global 
const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -557,7 +557,7 @@ void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u3 u32x key[2]; - transform_racf_key (w0, w1, key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0, w1, 
key); const u32x c = key[0]; const u32x d = key[1]; @@ -582,7 +582,7 @@ void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u3 } } -void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u32 *s_ascii_to_ebcdic_pc, u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const 
void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -632,7 +632,7 @@ void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u3 u32x key[2]; - transform_racf_key (w0, w1, key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0, w1, key); const u32x c = key[0]; const u32x d = key[1]; @@ -657,7 +657,7 @@ void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u3 } } -__kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, 
__global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -671,7 +671,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -696,11 +695,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -734,18 +728,10 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m08500m (s_SPtrans, s_skb, s_ascii_to_ebcdic_pc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + 
m08500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m08500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const 
u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -759,7 +745,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -784,11 +769,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -822,13 +802,5 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m08500s (s_SPtrans, s_skb, s_ascii_to_ebcdic_pc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m08500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant 
const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ + m08500s 
(s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } diff --git a/OpenCL/m08600_a0-optimized.cl b/OpenCL/m08600_a0.cl similarity index 71% rename from OpenCL/m08600_a0-optimized.cl rename to OpenCL/m08600_a0.cl index 8e5b7e9a2..4e65a519b 100644 --- a/OpenCL/m08600_a0-optimized.cl +++ b/OpenCL/m08600_a0.cl @@ -86,7 +86,7 @@ void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) } } -void lotus_transform_password (u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) +void lotus_transform_password (const u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) { u32x t = out[3] >> 24; @@ -183,7 +183,7 @@ void pad (u32 w[4], const u32 len) } } -void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform_norecalc (u32x state[4], const u32x block[4], __local u32 *s_lotus_magic_table) { u32x x[12]; @@ -208,7 +208,7 @@ void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_ma state[3] = x[3]; } -void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform (u32x state[4], u32x checksum[4], const u32x block[4], __local u32 *s_lotus_magic_table) { mdtransform_norecalc (state, block, s_lotus_magic_table); @@ -229,7 +229,7 @@ void domino_big_md (const u32x saved_key[4], const u32 size, u32x state[4], __lo mdtransform_norecalc (state, checksum, s_lotus_magic_table); } -__kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, 
__global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -308,15 +308,7 @@ __kernel void 
m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 
bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global 
u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -406,11 +398,3 @@ __kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } } - -__kernel void m08600_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global 
const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m08600_a1-optimized.cl b/OpenCL/m08600_a1.cl similarity index 75% rename from OpenCL/m08600_a1-optimized.cl rename to OpenCL/m08600_a1.cl index e8b143ca2..d169f79bf 100644 --- a/OpenCL/m08600_a1-optimized.cl +++ b/OpenCL/m08600_a1.cl @@ -84,7 +84,7 @@ void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) } } -void lotus_transform_password (u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) +void lotus_transform_password (const u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) { u32x t = out[3] >> 24; @@ -181,7 +181,7 @@ void pad (u32 w[4], const u32 len) } } -void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform_norecalc (u32x state[4], const u32x block[4], __local u32 *s_lotus_magic_table) { u32x x[12]; @@ -206,7 +206,7 @@ void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_ma state[3] = x[3]; } -void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform (u32x state[4], u32x checksum[4], const u32x block[4], __local u32 *s_lotus_magic_table) { mdtransform_norecalc (state, block, s_lotus_magic_table); @@ -227,7 +227,7 @@ void domino_big_md (const u32x saved_key[4], const u32 size, u32x state[4], __lo mdtransform_norecalc (state, checksum, s_lotus_magic_table); } -__kernel void m08600_m04 (__global pw_t *pws, 
__global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 
digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -351,15 +351,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 
*d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void 
*esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -494,11 +486,3 @@ __kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } } - -__kernel void m08600_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const 
u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m08600_a3-optimized.cl b/OpenCL/m08600_a3.cl similarity index 59% rename from OpenCL/m08600_a3-optimized.cl rename to OpenCL/m08600_a3.cl index f2ddc9df5..4c881033e 100644 --- a/OpenCL/m08600_a3-optimized.cl +++ b/OpenCL/m08600_a3.cl @@ -83,7 +83,7 @@ void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) } } -void lotus_transform_password (u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) +void lotus_transform_password (const u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) { u32x t = out[3] >> 24; @@ -180,7 +180,7 @@ void pad (u32 w[4], const u32 len) } } -void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform_norecalc (u32x state[4], const u32x block[4], __local u32 *s_lotus_magic_table) { u32x x[12]; @@ -205,7 +205,7 @@ void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_ma state[3] = x[3]; } -void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform (u32x state[4], u32x checksum[4], const u32x block[4], __local u32 *s_lotus_magic_table) { mdtransform_norecalc (state, block, s_lotus_magic_table); @@ -332,7 +332,7 @@ void m08600s (__local u32 *s_lotus_magic_table, u32 w[16], const u32 pw_len, __g } } 
-__kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 
loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -389,121 +389,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rule m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m08600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = 
lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 0; - w[15] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m08600_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 
loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = pws[gid].i[ 8]; - w[ 9] = pws[gid].i[ 9]; - w[10] = pws[gid].i[10]; - w[11] = pws[gid].i[11]; - w[12] = pws[gid].i[12]; - w[13] = pws[gid].i[13]; - w[14] = pws[gid].i[14]; - w[15] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 
*bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -559,117 +445,3 @@ __kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rule m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, 
digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } - -__kernel void m08600_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 
0; - w[15] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m08600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - 
/** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = pws[gid].i[ 8]; - w[ 9] = pws[gid].i[ 9]; - w[10] = pws[gid].i[10]; - w[11] = pws[gid].i[11]; - w[12] = pws[gid].i[12]; - w[13] = pws[gid].i[13]; - w[14] = pws[gid].i[14]; - w[15] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} diff --git a/OpenCL/m08900.cl b/OpenCL/m08900.cl index 402c2e0c0..4d3dd3913 100644 --- a/OpenCL/m08900.cl +++ b/OpenCL/m08900.cl @@ -138,6 +138,16 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 xd4 = x / 4; const u32 xm4 = x & 3; + __global uint4 *V; + + switch (xm4) + { + case 0: V = V0; break; + case 1: V = V1; break; + case 2: V = V2; break; + case 3: V = V3; break; + } + #ifdef _unroll #pragma unroll #endif @@ -156,13 +166,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ for (u32 y = 0; y < ySIZE; y++) { - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) V0[CO] = X[z]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) V1[CO] = X[z]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) V2[CO] = X[z]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) V3[CO] = X[z]; break; - } + for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } @@ -175,13 
+179,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 km = k - (y * SCRYPT_TMTO); - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) T[z] = V0[CO]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) T[z] = V1[CO]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) T[z] = V2[CO]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) T[z] = V3[CO]; break; - } + for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO]; for (u32 i = 0; i < km; i++) salsa_r (T); @@ -267,8 +265,6 @@ __kernel void m08900_init (__global pw_t *pws, __global const kernel_rule_t *rul const uint4 tmp0 = (uint4) (digest[0], digest[1], digest[2], digest[3]); const uint4 tmp1 = (uint4) (digest[4], digest[5], digest[6], digest[7]); - barrier (CLK_GLOBAL_MEM_FENCE); - tmps[gid].P[k + 0] = tmp0; tmps[gid].P[k + 1] = tmp1; } @@ -333,8 +329,6 @@ __kernel void m08900_comp (__global pw_t *pws, __global const kernel_rule_t *rul for (u32 l = 0; l < SCRYPT_CNT4; l += 4) { - barrier (CLK_GLOBAL_MEM_FENCE); - uint4 tmp; tmp = tmps[gid].P[l + 0]; diff --git a/OpenCL/m09700_a0-optimized.cl b/OpenCL/m09700_a0-optimized.cl index 6262cd701..49fc39362 100644 --- a/OpenCL/m09700_a0-optimized.cl +++ b/OpenCL/m09700_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -138,104 +139,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - 
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H 
, a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4]) { u32 digest_t0[2]; diff --git a/OpenCL/m09700_a1-optimized.cl b/OpenCL/m09700_a1-optimized.cl index 50dc116e9..5956d9d52 100644 --- a/OpenCL/m09700_a1-optimized.cl +++ b/OpenCL/m09700_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef 
struct { @@ -136,104 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, 
MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, 
MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4]) { u32 digest_t0[2]; diff --git a/OpenCL/m09700_a3-optimized.cl b/OpenCL/m09700_a3-optimized.cl index 645036a74..306f42dfc 100644 --- a/OpenCL/m09700_a3-optimized.cl +++ b/OpenCL/m09700_a3-optimized.cl @@ -9,6 +9,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -133,104 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - 
MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m09700m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff 
--git a/OpenCL/m09710_a0-optimized.cl b/OpenCL/m09710_a0-optimized.cl index b0676599d..4b72fcf7d 100644 --- a/OpenCL/m09710_a0-optimized.cl +++ b/OpenCL/m09710_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -138,104 +139,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, 
b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, 
w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m09710_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09710_a1-optimized.cl b/OpenCL/m09710_a1-optimized.cl index 9aa5bc540..6bbfb702f 100644 --- a/OpenCL/m09710_a1-optimized.cl +++ b/OpenCL/m09710_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -136,104 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, 
const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP 
(MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, 
a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m09710_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09710_a3-optimized.cl b/OpenCL/m09710_a3-optimized.cl index 3333fadb9..5b50fefa7 100644 --- a/OpenCL/m09710_a3-optimized.cl +++ b/OpenCL/m09710_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -136,104 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 
w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - 
MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m09710m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t 
*bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m09720_a0-optimized.cl b/OpenCL/m09720_a0-optimized.cl index 587bedb67..a8a0523d2 100644 --- a/OpenCL/m09720_a0-optimized.cl +++ b/OpenCL/m09720_a0-optimized.cl @@ -13,104 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" - -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, 
w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, 
MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} +#include "inc_hash_md5.cl" void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) { @@ -220,7 +123,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[3] |= digest_t3[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -273,7 +176,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 62.. w3_t[3] |= digest_t2[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -326,7 +229,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 61.. 
w3_t[3] |= digest_t1[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -379,7 +282,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 60.. w3_t[3] = digest_t0[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -434,7 +337,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] |= digest_t3[0]; w3_t[3] = digest_t3[1]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = salt_buf_t0[0]; w0_t[1] = salt_buf_t0[1]; @@ -453,7 +356,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] = 21 * 16 * 8; w3_t[3] = 0; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ 
-529,7 +432,7 @@ __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; @@ -647,7 +550,7 @@ __kernel void m09720_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; diff --git a/OpenCL/m09720_a1-optimized.cl b/OpenCL/m09720_a1-optimized.cl index 97f26c391..1e28f738f 100644 --- a/OpenCL/m09720_a1-optimized.cl +++ b/OpenCL/m09720_a1-optimized.cl @@ -11,104 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, 
a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, 
MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} +#include "inc_hash_md5.cl" void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) { @@ -218,7 +121,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[3] |= digest_t3[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -271,7 +174,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 62.. w3_t[3] |= digest_t2[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -324,7 +227,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 61.. w3_t[3] |= digest_t1[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -377,7 +280,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 60.. 
w3_t[3] = digest_t0[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -432,7 +335,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] |= digest_t3[0]; w3_t[3] = digest_t3[1]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = salt_buf_t0[0]; w0_t[1] = salt_buf_t0[1]; @@ -451,7 +354,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] = 21 * 16 * 8; w3_t[3] = 0; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -577,7 +480,7 @@ __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); 
digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; @@ -745,7 +648,7 @@ __kernel void m09720_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; diff --git a/OpenCL/m09720_a3-optimized.cl b/OpenCL/m09720_a3-optimized.cl index 870b324a1..237a3cb9d 100644 --- a/OpenCL/m09720_a3-optimized.cl +++ b/OpenCL/m09720_a3-optimized.cl @@ -11,104 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - 
MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} +#include "inc_hash_md5.cl" void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) { @@ -218,7 +121,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[3] |= digest_t3[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -271,7 +174,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 62.. w3_t[3] |= digest_t2[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -324,7 +227,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 61.. w3_t[3] |= digest_t1[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -377,7 +280,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 60.. 
w3_t[3] = digest_t0[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -432,7 +335,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] |= digest_t3[0]; w3_t[3] = digest_t3[1]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = salt_buf_t0[0]; w0_t[1] = salt_buf_t0[1]; @@ -451,7 +354,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] = 21 * 16 * 8; w3_t[3] = 0; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) @@ -519,7 +422,7 @@ void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest_pre); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, 
digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; @@ -621,7 +524,7 @@ void m09720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest_pre); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; diff --git a/OpenCL/m09800_a0-optimized.cl b/OpenCL/m09800_a0-optimized.cl index f8a70ca7a..2c26ce3be 100644 --- a/OpenCL/m09800_a0-optimized.cl +++ b/OpenCL/m09800_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -138,134 +139,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, 
D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, 
E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, 
B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, 
wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09800_a1-optimized.cl b/OpenCL/m09800_a1-optimized.cl index 0d0a44811..5064b8c61 100644 --- a/OpenCL/m09800_a1-optimized.cl +++ b/OpenCL/m09800_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -136,134 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - 
- u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 
1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); 
SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 
1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09800_a3-optimized.cl b/OpenCL/m09800_a3-optimized.cl index 6392d5b6b..ffa1df0ec 100644 --- a/OpenCL/m09800_a3-optimized.cl +++ b/OpenCL/m09800_a3-optimized.cl @@ -9,6 +9,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -133,134 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); 
- SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); 
SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 
1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); 
SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - void m09800m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m09810_a0-optimized.cl b/OpenCL/m09810_a0-optimized.cl index 92f7c8a7a..8ee19da53 100644 --- a/OpenCL/m09810_a0-optimized.cl +++ b/OpenCL/m09810_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -138,134 +139,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 
w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, 
B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, 
B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, 
E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09810_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global 
u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09810_a1-optimized.cl b/OpenCL/m09810_a1-optimized.cl index 7eb8619be..7d08a70d5 100644 --- a/OpenCL/m09810_a1-optimized.cl +++ b/OpenCL/m09810_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -136,134 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, 
A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, 
B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, 
D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, 
A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09810_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09810_a3-optimized.cl b/OpenCL/m09810_a3-optimized.cl index b2db6890c..fc8733b50 100644 --- a/OpenCL/m09810_a3-optimized.cl +++ b/OpenCL/m09810_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -136,134 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 
w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ 
wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ 
wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t 
^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - void m09810m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m09820_a0-optimized.cl b/OpenCL/m09820_a0-optimized.cl index 367d2a32a..f895d7939 100644 --- a/OpenCL/m09820_a0-optimized.cl +++ b/OpenCL/m09820_a0-optimized.cl @@ -13,134 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ 
w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 
1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ 
wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] 
+= C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -232,7 +105,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -257,7 +130,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; @@ -378,7 +251,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -403,7 
+276,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; diff --git a/OpenCL/m09820_a1-optimized.cl b/OpenCL/m09820_a1-optimized.cl index 982cae319..113af54b0 100644 --- a/OpenCL/m09820_a1-optimized.cl +++ b/OpenCL/m09820_a1-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 
((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ 
we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 
((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - 
digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -280,7 +153,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -305,7 +178,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; @@ -476,7 +349,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; 
w0[1] = digest[1]; @@ -501,7 +374,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; diff --git a/OpenCL/m09820_a3-optimized.cl b/OpenCL/m09820_a3-optimized.cl index 34472dc8b..a63d26aa5 100644 --- a/OpenCL/m09820_a3-optimized.cl +++ b/OpenCL/m09820_a3-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, 
C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, 
w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, 
E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, 
wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -206,7 +79,7 @@ void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = digest[0]; w0_t[1] = digest[1]; @@ -231,7 +104,7 @@ void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; @@ -320,7 +193,7 @@ void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, 
digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = digest[0]; w0_t[1] = digest[1]; @@ -345,7 +218,7 @@ void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; diff --git a/OpenCL/m09900_a0.cl b/OpenCL/m09900_a0.cl new file mode 100644 index 000000000..7e7459d11 --- /dev/null +++ b/OpenCL/m09900_a0.cl @@ -0,0 +1,130 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * 
modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, 100); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + 
digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, 100); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m09900_a1.cl b/OpenCL/m09900_a1.cl new file mode 100644 index 000000000..6ea1e9497 --- /dev/null +++ b/OpenCL/m09900_a1.cl @@ -0,0 +1,110 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, 100 - pws[gid].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 
gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, 100 - pws[gid].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m09900_a3.cl b/OpenCL/m09900_a3.cl new file mode 100644 index 000000000..28b1ebe9f --- /dev/null +++ b/OpenCL/m09900_a3.cl @@ -0,0 +1,140 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, 
__global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + md5_update_vector (&ctx, w, 100); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 
*d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + md5_update_vector (&ctx, w, 100); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m10400_a0-optimized.cl b/OpenCL/m10400_a0-optimized.cl index 9c1fae1f3..f93a6ad62 100644 --- a/OpenCL/m10400_a0-optimized.cl +++ b/OpenCL/m10400_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { @@ -135,104 +136,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 ou return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], 
u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, 
wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - 
digest[2] += c; - digest[3] += d; -} - __kernel void m10400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10400_a1-optimized.cl b/OpenCL/m10400_a1-optimized.cl index cbcdf37d5..3f3a1f43b 100644 --- a/OpenCL/m10400_a1-optimized.cl +++ b/OpenCL/m10400_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { @@ -133,104 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 ou return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = 
w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , 
b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m10400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10400_a3-optimized.cl b/OpenCL/m10400_a3-optimized.cl index c1205fe5f..e4b04ee67 100644 --- a/OpenCL/m10400_a3-optimized.cl +++ b/OpenCL/m10400_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { @@ -133,104 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 ou return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - 
MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H 
, c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m10400m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t 
*pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m10410_a0-optimized.cl b/OpenCL/m10410_a0-optimized.cl index 064a5d42d..c96e3b6d2 100644 --- a/OpenCL/m10410_a0-optimized.cl +++ b/OpenCL/m10410_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { diff --git a/OpenCL/m10410_a1-optimized.cl b/OpenCL/m10410_a1-optimized.cl index ac85ca10b..ea23844cc 100644 --- a/OpenCL/m10410_a1-optimized.cl +++ b/OpenCL/m10410_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { diff --git a/OpenCL/m10410_a3-optimized.cl b/OpenCL/m10410_a3-optimized.cl index db10bc257..46ad89173 100644 --- a/OpenCL/m10410_a3-optimized.cl +++ b/OpenCL/m10410_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { diff --git a/OpenCL/m10420_a0-optimized.cl b/OpenCL/m10420_a0-optimized.cl index 9eb143638..7295e2741 100644 --- a/OpenCL/m10420_a0-optimized.cl +++ b/OpenCL/m10420_a0-optimized.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -13,6 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32a padding[8] = { @@ -26,104 +27,6 @@ __constant u32a padding[8] = 0x7a695364 }; -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = 
digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, 
MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += 
c; - digest[3] += d; -} - __kernel void m10420_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10420_a1-optimized.cl b/OpenCL/m10420_a1-optimized.cl index bf016c57c..1e64ca279 100644 --- a/OpenCL/m10420_a1-optimized.cl +++ b/OpenCL/m10420_a1-optimized.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -11,6 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32a padding[8] = { @@ -24,104 +25,6 @@ __constant u32a padding[8] = 0x7a695364 }; -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = 
w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, 
a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m10420_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, 
__global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10420_a3-optimized.cl b/OpenCL/m10420_a3-optimized.cl index 0c1b76e5b..9df3068d8 100644 --- a/OpenCL/m10420_a3-optimized.cl +++ b/OpenCL/m10420_a3-optimized.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -11,6 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32a padding[8] = { @@ -24,104 +25,6 @@ __constant u32a padding[8] = 0x7a695364 }; -void md5_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP_S 
(MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP_S 
(MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 
*bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** @@ -245,7 +148,7 @@ void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[2] = MD5M_C; digest[3] = MD5M_D; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = P; w0_t[1] = id_buf[0]; @@ -264,12 +167,12 @@ void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl w3_t[2] = 84 * 8; w3_t[3] = 0; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; - u32x c = 0; - u32x d = 0; + u32 a = digest[0]; + u32 b = digest[1] & 0xff; + u32 c = 0; + u32 d = 0; COMPARE_M_SIMD (a, b, c, d); } @@ -410,7 +313,7 @@ void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[2] = MD5M_C; digest[3] = MD5M_D; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = P; w0_t[1] = id_buf[0]; @@ -429,12 +332,12 @@ void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl w3_t[2] = 84 * 8; w3_t[3] = 0; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; - u32x c = 0; - u32x d = 0; + u32 a = digest[0]; + u32 b 
= digest[1] & 0xff; + u32 c = 0; + u32 d = 0; COMPARE_S_SIMD (a, b, c, d); } diff --git a/OpenCL/m10700.cl b/OpenCL/m10700.cl index a341d3d6e..693691652 100644 --- a/OpenCL/m10700.cl +++ b/OpenCL/m10700.cl @@ -1202,8 +1202,6 @@ __kernel void m10700_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m10800_a0.cl b/OpenCL/m10800_a0.cl index 1ae43d030..a84376f81 100644 --- a/OpenCL/m10800_a0.cl +++ b/OpenCL/m10800_a0.cl @@ -39,8 +39,6 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -59,10 +57,10 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 (ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_M_SCALAR (r0, r1, r2, r3); } @@ -104,8 +102,6 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -124,10 +120,10 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 (ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_S_SCALAR (r0, r1, r2, r3); } 
diff --git a/OpenCL/m10800_a1.cl b/OpenCL/m10800_a1.cl index 9f1eded3a..a666a6019 100644 --- a/OpenCL/m10800_a1.cl +++ b/OpenCL/m10800_a1.cl @@ -46,10 +46,10 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 (ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_M_SCALAR (r0, r1, r2, r3); } @@ -100,10 +100,10 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 (ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_S_SCALAR (r0, r1, r2, r3); } diff --git a/OpenCL/m10800_a3.cl b/OpenCL/m10800_a3.cl index 72ba67211..adff74a7f 100644 --- a/OpenCL/m10800_a3.cl +++ b/OpenCL/m10800_a3.cl @@ -37,8 +37,6 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m11000_a0.cl b/OpenCL/m11000_a0.cl new file mode 100644 index 000000000..cc7310547 --- /dev/null +++ b/OpenCL/m11000_a0.cl @@ -0,0 +1,138 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include 
"inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + 
const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add 
rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11000_a1.cl b/OpenCL/m11000_a1.cl new file mode 100644 index 000000000..291d1c783 --- /dev/null +++ b/OpenCL/m11000_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + 
md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + 
+ md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11000_a3.cl b/OpenCL/m11000_a3.cl new file mode 100644 index 000000000..b6e7b5f32 --- /dev/null +++ b/OpenCL/m11000_a3.cl @@ -0,0 +1,152 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const 
u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 
salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11100_a0.cl b/OpenCL/m11100_a0.cl new file mode 100644 index 000000000..e2b372639 --- /dev/null +++ b/OpenCL/m11100_a0.cl @@ -0,0 +1,346 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) 
(l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + 
const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1; + + md5_init (&ctx1); + + md5_update (&ctx1, w, pw_len); + + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + 
ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, 
const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1; + + md5_init (&ctx1); + + md5_update 
(&ctx1, w, pw_len); + + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11100_a1.cl b/OpenCL/m11100_a1.cl new file mode 100644 index 000000000..a2b234985 --- /dev/null +++ b/OpenCL/m11100_a1.cl @@ -0,0 +1,326 @@ +/** + * Author......: See docs/credits.txt + 
* License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 
bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + md5_ctx_t ctx0t; + + md5_init (&ctx0t); + + md5_update_global (&ctx0t, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0t; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + 
md5_update_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + md5_ctx_t ctx0t; + + md5_init (&ctx0t); + + md5_update_global (&ctx0t, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + 
+ for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0t; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11100_a3.cl b/OpenCL/m11100_a3.cl new file mode 
100644 index 000000000..48216deed --- /dev/null +++ b/OpenCL/m11100_a3.cl @@ -0,0 +1,386 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 
*d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_ctx_vector_t ctx1; + + md5_init_vector (&ctx1); + + md5_update_vector (&ctx1, w, pw_len); + + u32x 
s0[4]; + u32x s1[4]; + u32x s2[4]; + u32x s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_vector_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w2[0] = challenge; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = 
uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + 
*/ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_ctx_vector_t ctx1; + + md5_init_vector (&ctx1); + + md5_update_vector (&ctx1, w, pw_len); + + u32x s0[4]; + u32x s1[4]; + u32x s2[4]; + u32x s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 
0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_vector_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11200_a0.cl b/OpenCL/m11200_a0.cl new file mode 100644 index 000000000..1ba45efe8 --- /dev/null +++ b/OpenCL/m11200_a0.cl @@ -0,0 +1,274 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include 
"inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx2; + + sha1_init (&ctx2); + + sha1_update_swap (&ctx2, w, pw_len); + + sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = 
c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11200_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, 
const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx2; + + sha1_init (&ctx2); + + sha1_update_swap (&ctx2, w, pw_len); + + sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = 
ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11200_a1.cl b/OpenCL/m11200_a1.cl new file mode 100644 index 000000000..ef5836abf --- /dev/null +++ b/OpenCL/m11200_a1.cl @@ -0,0 +1,254 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx2l; + + sha1_init (&ctx2l); + + 
sha1_update_global_swap (&ctx2l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx2 = ctx2l; + + sha1_update_global_swap (&ctx2, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11200_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx2l; + + sha1_init (&ctx2l); + + sha1_update_global_swap (&ctx2l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx2 = ctx2l; + + sha1_update_global_swap (&ctx2, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + 
w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11200_a3.cl b/OpenCL/m11200_a3.cl new file mode 100644 index 000000000..a76f4d508 --- /dev/null +++ b/OpenCL/m11200_a3.cl @@ -0,0 +1,300 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, 
__constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector (&ctx2); + + sha1_update_vector (&ctx2, w, pw_len); + + sha1_final_vector (&ctx2); + + u32x a = ctx2.h[0]; + u32x b = ctx2.h[1]; + u32x c = ctx2.h[2]; + u32x d = ctx2.h[3]; + u32x e = ctx2.h[4]; + + const u32x a_sav = a; + const u32x b_sav = b; + const u32x c_sav = c; + const u32x d_sav = d; + const u32x e_sav 
= e; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final_vector (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final_vector (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11200_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, 
const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector (&ctx2); + + sha1_update_vector (&ctx2, w, pw_len); + + sha1_final_vector (&ctx2); + + u32x a = ctx2.h[0]; + u32x b = ctx2.h[1]; + u32x c = ctx2.h[2]; + u32x d = ctx2.h[3]; + u32x e = ctx2.h[4]; + + const u32x a_sav = a; + const u32x b_sav = b; + const u32x c_sav = c; + const u32x d_sav = d; + const u32x e_sav = e; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final_vector (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 
0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final_vector (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11400_a0-optimized.cl b/OpenCL/m11400_a0-optimized.cl deleted file mode 100644 index 58d7f4d5c..000000000 --- a/OpenCL/m11400_a0-optimized.cl +++ /dev/null @@ -1,2261 +0,0 @@ -/** - * Author......: See docs/credits.txt - * License.....: MIT - */ - -//incompatible because of brances -//#define NEW_SIMD_CODE - -#include "inc_vendor.cl" -#include "inc_hash_constants.h" -#include "inc_hash_functions.cl" -#include "inc_types.cl" -#include "inc_common.cl" -#include "inc_rp.h" -#include "inc_rp.cl" -#include "inc_simd.cl" - -#if VECT_SIZE == 1 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) -#elif VECT_SIZE == 2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#elif VECT_SIZE == 4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#elif VECT_SIZE == 8 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) -#elif VECT_SIZE == 16 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) -#endif - -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x 
append3[4], const u32 append_len) -{ - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - mod; - - u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - - u32x append4_t[4]; - - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; - - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; 
- append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } - #endif - - #ifdef IS_NV - - const int offset_minus_4 = 4 - mod; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - - u32x append4_t[4]; - - append4_t[0] = __byte_perm (append3[3], 0, selector); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - #endif - - switch (div) - { - case 0: block0[ 0] |= append0_t[0]; - block0[ 1] = append0_t[1]; - block0[ 2] = append0_t[2]; - block0[ 3] = append0_t[3]; - - block0[ 4] = append1_t[0]; - block0[ 5] = append1_t[1]; - block0[ 6] = append1_t[2]; - block0[ 7] = append1_t[3]; - - block0[ 8] = append2_t[0]; - block0[ 9] = append2_t[1]; - block0[10] = append2_t[2]; - block0[11] = append2_t[3]; - - block0[12] = 
append3_t[0]; - block0[13] = append3_t[1]; - block0[14] = append3_t[2]; - block0[15] = append3_t[3]; - - block1[ 0] = append4_t[0]; - block1[ 1] = append4_t[1]; - block1[ 2] = append4_t[2]; - block1[ 3] = append4_t[3]; - break; - - case 1: block0[ 1] |= append0_t[0]; - block0[ 2] = append0_t[1]; - block0[ 3] = append0_t[2]; - block0[ 4] = append0_t[3]; - - block0[ 5] = append1_t[0]; - block0[ 6] = append1_t[1]; - block0[ 7] = append1_t[2]; - block0[ 8] = append1_t[3]; - - block0[ 9] = append2_t[0]; - block0[10] = append2_t[1]; - block0[11] = append2_t[2]; - block0[12] = append2_t[3]; - - block0[13] = append3_t[0]; - block0[14] = append3_t[1]; - block0[15] = append3_t[2]; - block1[ 0] = append3_t[3]; - - block1[ 1] = append4_t[0]; - block1[ 2] = append4_t[1]; - block1[ 3] = append4_t[2]; - block1[ 4] = append4_t[3]; - break; - - case 2: block0[ 2] |= append0_t[0]; - block0[ 3] = append0_t[1]; - block0[ 4] = append0_t[2]; - block0[ 5] = append0_t[3]; - - block0[ 6] = append1_t[0]; - block0[ 7] = append1_t[1]; - block0[ 8] = append1_t[2]; - block0[ 9] = append1_t[3]; - - block0[10] = append2_t[0]; - block0[11] = append2_t[1]; - block0[12] = append2_t[2]; - block0[13] = append2_t[3]; - - block0[14] = append3_t[0]; - block0[15] = append3_t[1]; - block1[ 0] = append3_t[2]; - block1[ 1] = append3_t[3]; - - block1[ 2] = append4_t[0]; - block1[ 3] = append4_t[1]; - block1[ 4] = append4_t[2]; - block1[ 5] = append4_t[3]; - break; - - case 3: block0[ 3] |= append0_t[0]; - block0[ 4] = append0_t[1]; - block0[ 5] = append0_t[2]; - block0[ 6] = append0_t[3]; - - block0[ 7] = append1_t[0]; - block0[ 8] = append1_t[1]; - block0[ 9] = append1_t[2]; - block0[10] = append1_t[3]; - - block0[11] = append2_t[0]; - block0[12] = append2_t[1]; - block0[13] = append2_t[2]; - block0[14] = append2_t[3]; - - block0[15] = append3_t[0]; - block1[ 0] = append3_t[1]; - block1[ 1] = append3_t[2]; - block1[ 2] = append3_t[3]; - - block1[ 3] = append4_t[0]; - block1[ 4] = append4_t[1]; - block1[ 5] = 
append4_t[2]; - block1[ 6] = append4_t[3]; - break; - - case 4: block0[ 4] |= append0_t[0]; - block0[ 5] = append0_t[1]; - block0[ 6] = append0_t[2]; - block0[ 7] = append0_t[3]; - - block0[ 8] = append1_t[0]; - block0[ 9] = append1_t[1]; - block0[10] = append1_t[2]; - block0[11] = append1_t[3]; - - block0[12] = append2_t[0]; - block0[13] = append2_t[1]; - block0[14] = append2_t[2]; - block0[15] = append2_t[3]; - - block1[ 0] = append3_t[0]; - block1[ 1] = append3_t[1]; - block1[ 2] = append3_t[2]; - block1[ 3] = append3_t[3]; - - block1[ 4] = append4_t[0]; - block1[ 5] = append4_t[1]; - block1[ 6] = append4_t[2]; - block1[ 7] = append4_t[3]; - break; - - case 5: block0[ 5] |= append0_t[0]; - block0[ 6] = append0_t[1]; - block0[ 7] = append0_t[2]; - block0[ 8] = append0_t[3]; - - block0[ 9] = append1_t[0]; - block0[10] = append1_t[1]; - block0[11] = append1_t[2]; - block0[12] = append1_t[3]; - - block0[13] = append2_t[0]; - block0[14] = append2_t[1]; - block0[15] = append2_t[2]; - block1[ 0] = append2_t[3]; - - block1[ 1] = append3_t[0]; - block1[ 2] = append3_t[1]; - block1[ 3] = append3_t[2]; - block1[ 4] = append3_t[3]; - - block1[ 5] = append4_t[0]; - block1[ 6] = append4_t[1]; - block1[ 7] = append4_t[2]; - block1[ 8] = append4_t[3]; - break; - - case 6: block0[ 6] |= append0_t[0]; - block0[ 7] = append0_t[1]; - block0[ 8] = append0_t[2]; - block0[ 9] = append0_t[3]; - - block0[10] = append1_t[0]; - block0[11] = append1_t[1]; - block0[12] = append1_t[2]; - block0[13] = append1_t[3]; - - block0[14] = append2_t[0]; - block0[15] = append2_t[1]; - block1[ 0] = append2_t[2]; - block1[ 1] = append2_t[3]; - - block1[ 2] = append3_t[0]; - block1[ 3] = append3_t[1]; - block1[ 4] = append3_t[2]; - block1[ 5] = append3_t[3]; - - block1[ 6] = append4_t[0]; - block1[ 7] = append4_t[1]; - block1[ 8] = append4_t[2]; - block1[ 9] = append4_t[3]; - break; - - case 7: block0[ 7] |= append0_t[0]; - block0[ 8] = append0_t[1]; - block0[ 9] = append0_t[2]; - block0[10] = 
append0_t[3]; - - block0[11] = append1_t[0]; - block0[12] = append1_t[1]; - block0[13] = append1_t[2]; - block0[14] = append1_t[3]; - - block0[15] = append2_t[0]; - block1[ 0] = append2_t[1]; - block1[ 1] = append2_t[2]; - block1[ 2] = append2_t[3]; - - block1[ 3] = append3_t[0]; - block1[ 4] = append3_t[1]; - block1[ 5] = append3_t[2]; - block1[ 6] = append3_t[3]; - - block1[ 7] = append4_t[0]; - block1[ 8] = append4_t[1]; - block1[ 9] = append4_t[2]; - block1[10] = append4_t[3]; - break; - - case 8: block0[ 8] |= append0_t[0]; - block0[ 9] = append0_t[1]; - block0[10] = append0_t[2]; - block0[11] = append0_t[3]; - - block0[12] = append1_t[0]; - block0[13] = append1_t[1]; - block0[14] = append1_t[2]; - block0[15] = append1_t[3]; - - block1[ 0] = append2_t[0]; - block1[ 1] = append2_t[1]; - block1[ 2] = append2_t[2]; - block1[ 3] = append2_t[3]; - - block1[ 4] = append3_t[0]; - block1[ 5] = append3_t[1]; - block1[ 6] = append3_t[2]; - block1[ 7] = append3_t[3]; - - block1[ 8] = append4_t[0]; - block1[ 9] = append4_t[1]; - block1[10] = append4_t[2]; - block1[11] = append4_t[3]; - break; - - case 9: block0[ 9] |= append0_t[0]; - block0[10] = append0_t[1]; - block0[11] = append0_t[2]; - block0[12] = append0_t[3]; - - block0[13] = append1_t[0]; - block0[14] = append1_t[1]; - block0[15] = append1_t[2]; - block1[ 0] = append1_t[3]; - - block1[ 1] = append2_t[0]; - block1[ 2] = append2_t[1]; - block1[ 3] = append2_t[2]; - block1[ 4] = append2_t[3]; - - block1[ 5] = append3_t[0]; - block1[ 6] = append3_t[1]; - block1[ 7] = append3_t[2]; - block1[ 8] = append3_t[3]; - - block1[ 9] = append4_t[0]; - block1[10] = append4_t[1]; - block1[11] = append4_t[2]; - block1[12] = append4_t[3]; - break; - - case 10: block0[10] |= append0_t[0]; - block0[11] = append0_t[1]; - block0[12] = append0_t[2]; - block0[13] = append0_t[3]; - - block0[14] = append1_t[0]; - block0[15] = append1_t[1]; - block1[ 0] = append1_t[2]; - block1[ 1] = append1_t[3]; - - block1[ 2] = append2_t[0]; - block1[ 
3] = append2_t[1]; - block1[ 4] = append2_t[2]; - block1[ 5] = append2_t[3]; - - block1[ 6] = append3_t[0]; - block1[ 7] = append3_t[1]; - block1[ 8] = append3_t[2]; - block1[ 9] = append3_t[3]; - - block1[10] = append4_t[0]; - block1[11] = append4_t[1]; - block1[12] = append4_t[2]; - block1[13] = append4_t[3]; - break; - - case 11: block0[11] |= append0_t[0]; - block0[12] = append0_t[1]; - block0[13] = append0_t[2]; - block0[14] = append0_t[3]; - - block0[15] = append1_t[0]; - block1[ 0] = append1_t[1]; - block1[ 1] = append1_t[2]; - block1[ 2] = append1_t[3]; - - block1[ 3] = append2_t[0]; - block1[ 4] = append2_t[1]; - block1[ 5] = append2_t[2]; - block1[ 6] = append2_t[3]; - - block1[ 7] = append3_t[0]; - block1[ 8] = append3_t[1]; - block1[ 9] = append3_t[2]; - block1[10] = append3_t[3]; - - block1[11] = append4_t[0]; - block1[12] = append4_t[1]; - block1[13] = append4_t[2]; - block1[14] = append4_t[3]; - break; - - case 12: block0[12] |= append0_t[0]; - block0[13] = append0_t[1]; - block0[14] = append0_t[2]; - block0[15] = append0_t[3]; - - block1[ 0] = append1_t[0]; - block1[ 1] = append1_t[1]; - block1[ 2] = append1_t[2]; - block1[ 3] = append1_t[3]; - - block1[ 4] = append2_t[0]; - block1[ 5] = append2_t[1]; - block1[ 6] = append2_t[2]; - block1[ 7] = append2_t[3]; - - block1[ 8] = append3_t[0]; - block1[ 9] = append3_t[1]; - block1[10] = append3_t[2]; - block1[11] = append3_t[3]; - - block1[12] = append4_t[0]; - block1[13] = append4_t[1]; - block1[14] = append4_t[2]; - block1[15] = append4_t[3]; - break; - - case 13: block0[13] |= append0_t[0]; - block0[14] = append0_t[1]; - block0[15] = append0_t[2]; - block1[ 0] = append0_t[3]; - - block1[ 1] = append1_t[0]; - block1[ 2] = append1_t[1]; - block1[ 3] = append1_t[2]; - block1[ 4] = append1_t[3]; - - block1[ 5] = append2_t[0]; - block1[ 6] = append2_t[1]; - block1[ 7] = append2_t[2]; - block1[ 8] = append2_t[3]; - - block1[ 9] = append3_t[0]; - block1[10] = append3_t[1]; - block1[11] = append3_t[2]; - 
block1[12] = append3_t[3]; - - block1[13] = append4_t[0]; - block1[14] = append4_t[1]; - block1[15] = append4_t[2]; - break; - - case 14: block0[14] |= append0_t[0]; - block0[15] = append0_t[1]; - block1[ 0] = append0_t[2]; - block1[ 1] = append0_t[3]; - - block1[ 2] = append1_t[0]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - - block1[ 6] = append2_t[0]; - block1[ 7] = append2_t[1]; - block1[ 8] = append2_t[2]; - block1[ 9] = append2_t[3]; - - block1[10] = append3_t[0]; - block1[11] = append3_t[1]; - block1[12] = append3_t[2]; - block1[13] = append3_t[3]; - - block1[14] = append4_t[0]; - block1[15] = append4_t[1]; - break; - - case 15: block0[15] |= append0_t[0]; - block1[ 0] = append0_t[1]; - block1[ 1] = append0_t[2]; - block1[ 2] = append0_t[3]; - - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append1_t[0]; - - block1[ 7] = append2_t[0]; - block1[ 8] = append2_t[1]; - block1[ 9] = append2_t[2]; - block1[10] = append2_t[3]; - - block1[11] = append3_t[0]; - block1[12] = append3_t[1]; - block1[13] = append3_t[2]; - block1[14] = append3_t[3]; - - block1[15] = append4_t[0]; - break; - - case 16: block1[ 0] |= append0_t[0]; - block1[ 1] = append0_t[1]; - block1[ 2] = append0_t[2]; - block1[ 3] = append0_t[3]; - - block1[ 4] = append1_t[0]; - block1[ 5] = append1_t[1]; - block1[ 6] = append1_t[2]; - block1[ 7] = append1_t[3]; - - block1[ 8] = append2_t[0]; - block1[ 9] = append2_t[1]; - block1[10] = append2_t[2]; - block1[11] = append2_t[3]; - - block1[12] = append3_t[0]; - block1[13] = append3_t[1]; - block1[14] = append3_t[2]; - block1[15] = append3_t[3]; - break; - - case 17: block1[ 1] |= append0_t[0]; - block1[ 2] = append0_t[1]; - block1[ 3] = append0_t[2]; - block1[ 4] = append0_t[3]; - - block1[ 5] = append1_t[0]; - block1[ 6] = append1_t[1]; - block1[ 7] = append1_t[2]; - block1[ 8] = append1_t[3]; - - block1[ 9] = append2_t[0]; - block1[10] = append2_t[1]; - 
block1[11] = append2_t[2]; - block1[12] = append2_t[3]; - - block1[13] = append3_t[0]; - block1[14] = append3_t[1]; - block1[15] = append3_t[2]; - break; - - case 18: block1[ 2] |= append0_t[0]; - block1[ 3] = append0_t[1]; - block1[ 4] = append0_t[2]; - block1[ 5] = append0_t[3]; - - block1[ 6] = append1_t[0]; - block1[ 7] = append1_t[1]; - block1[ 8] = append1_t[2]; - block1[ 9] = append1_t[3]; - - block1[10] = append2_t[0]; - block1[11] = append2_t[1]; - block1[12] = append2_t[2]; - block1[13] = append2_t[3]; - - block1[14] = append3_t[0]; - block1[15] = append3_t[1]; - break; - - case 19: block1[ 3] |= append0_t[0]; - block1[ 4] = append0_t[1]; - block1[ 5] = append0_t[2]; - block1[ 6] = append0_t[3]; - - block1[ 7] = append1_t[0]; - block1[ 8] = append1_t[1]; - block1[ 9] = append1_t[2]; - block1[10] = append1_t[3]; - - block1[11] = append2_t[0]; - block1[12] = append2_t[1]; - block1[13] = append2_t[2]; - block1[14] = append2_t[3]; - - block1[15] = append3_t[0]; - break; - - case 20: block1[ 4] |= append0_t[0]; - block1[ 5] = append0_t[1]; - block1[ 6] = append0_t[2]; - block1[ 7] = append0_t[3]; - - block1[ 8] = append1_t[0]; - block1[ 9] = append1_t[1]; - block1[10] = append1_t[2]; - block1[11] = append1_t[3]; - - block1[12] = append2_t[0]; - block1[13] = append2_t[1]; - block1[14] = append2_t[2]; - block1[15] = append2_t[3]; - break; - - case 21: block1[ 5] |= append0_t[0]; - block1[ 6] = append0_t[1]; - block1[ 7] = append0_t[2]; - block1[ 8] = append0_t[3]; - - block1[ 9] = append1_t[0]; - block1[10] = append1_t[1]; - block1[11] = append1_t[2]; - block1[12] = append1_t[3]; - - block1[13] = append2_t[0]; - block1[14] = append2_t[1]; - block1[15] = append2_t[2]; - break; - - case 22: block1[ 6] |= append0_t[0]; - block1[ 7] = append0_t[1]; - block1[ 8] = append0_t[2]; - block1[ 9] = append0_t[3]; - - block1[10] = append1_t[0]; - block1[11] = append1_t[1]; - block1[12] = append1_t[2]; - block1[13] = append1_t[3]; - - block1[14] = append2_t[0]; - block1[15] = 
append2_t[1]; - break; - - case 23: block1[ 7] |= append0_t[0]; - block1[ 8] = append0_t[1]; - block1[ 9] = append0_t[2]; - block1[10] = append0_t[3]; - - block1[11] = append1_t[0]; - block1[12] = append1_t[1]; - block1[13] = append1_t[2]; - block1[14] = append1_t[3]; - - block1[15] = append2_t[0]; - break; - - case 24: block1[ 8] |= append0_t[0]; - block1[ 9] = append0_t[1]; - block1[10] = append0_t[2]; - block1[11] = append0_t[3]; - - block1[12] = append1_t[0]; - block1[13] = append1_t[1]; - block1[14] = append1_t[2]; - block1[15] = append1_t[3]; - break; - - case 25: block1[ 9] |= append0_t[0]; - block1[10] = append0_t[1]; - block1[11] = append0_t[2]; - block1[12] = append0_t[3]; - - block1[13] = append1_t[0]; - block1[14] = append1_t[1]; - block1[15] = append1_t[2]; - break; - - case 26: block1[10] |= append0_t[0]; - block1[11] = append0_t[1]; - block1[12] = append0_t[2]; - block1[13] = append0_t[3]; - - block1[14] = append1_t[0]; - block1[15] = append1_t[1]; - break; - - case 27: block1[11] |= append0_t[0]; - block1[12] = append0_t[1]; - block1[13] = append0_t[2]; - block1[14] = append0_t[3]; - - block1[15] = append1_t[0]; - break; - - case 28: block1[12] |= append0_t[0]; - block1[13] = append0_t[1]; - block1[14] = append0_t[2]; - block1[15] = append0_t[3]; - break; - - case 29: block1[13] |= append0_t[0]; - block1[14] = append0_t[1]; - block1[15] = append0_t[2]; - break; - - case 30: block1[14] |= append0_t[0]; - block1[15] = append0_t[1]; - break; - } - - u32 new_len = block_len + append_len; - - return new_len; -} - -__kernel void m11400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[ 0]; - pw_buf0[1] = pws[gid].i[ 1]; - pw_buf0[2] = pws[gid].i[ 2]; - pw_buf0[3] = pws[gid].i[ 3]; - pw_buf1[0] = pws[gid].i[ 4]; - pw_buf1[1] = pws[gid].i[ 5]; - pw_buf1[2] = pws[gid].i[ 6]; - pw_buf1[3] = pws[gid].i[ 7]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = 
esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = 
esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; 
- esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - u32x w0[4] = { 0 }; - u32x w1[4] = { 0 }; - u32x w2[4] = { 0 }; - u32x w3[4] = { 0 }; - - const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - - append_0x80_2x4_VV (w0, w1, out_len); - - const u32x pw_salt_len = salt_len + out_len; - - /* - * HA1 = md5 ($salt . $pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, out_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - 
w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, 
c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], 
MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - 
MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, 
d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - 
w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! - - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - 
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes 
> 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP 
(MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, 
w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -__kernel void m11400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global 
const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[ 0]; - pw_buf0[1] = pws[gid].i[ 1]; - pw_buf0[2] = pws[gid].i[ 2]; - pw_buf0[3] = pws[gid].i[ 3]; - pw_buf1[0] = pws[gid].i[ 4]; - pw_buf1[1] = pws[gid].i[ 5]; - pw_buf1[2] = pws[gid].i[ 6]; - pw_buf1[3] = pws[gid].i[ 7]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 
7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = 
esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - u32x w0[4] = { 0 }; - u32x w1[4] = { 0 }; - u32x w2[4] = { 0 }; - u32x w3[4] = { 0 }; - - const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - - append_0x80_2x4_VV (w0, w1, out_len); - - const u32x pw_salt_len = salt_len + out_len; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, out_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; 
- w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - 
MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! 
- - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, 
w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes > 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - 
w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -__kernel void m11400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m11400_a0.cl b/OpenCL/m11400_a0.cl new file mode 100644 index 000000000..72541c78c --- /dev/null +++ b/OpenCL/m11400_a0.cl @@ -0,0 +1,250 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 
*bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1 = ctx0; + + md5_update (&ctx1, w, pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, esalt_bufs[digests_offset].esalt_buf, esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11400_sxx (__global pw_t *pws, 
__global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1 = ctx0; + + md5_update (&ctx1, w, pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, esalt_bufs[digests_offset].esalt_buf, 
esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11400_a1-optimized.cl b/OpenCL/m11400_a1-optimized.cl deleted file mode 100644 index 4fc47020e..000000000 --- a/OpenCL/m11400_a1-optimized.cl +++ /dev/null @@ -1,2375 +0,0 @@ -/** - * Author......: See docs/credits.txt - * License.....: MIT - */ - -//incompatible because of brances -//#define NEW_SIMD_CODE - -#include "inc_vendor.cl" -#include "inc_hash_constants.h" -#include "inc_hash_functions.cl" -#include "inc_types.cl" -#include "inc_common.cl" -#include "inc_simd.cl" - -#if VECT_SIZE == 1 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) -#elif VECT_SIZE == 2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#elif VECT_SIZE == 4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#elif VECT_SIZE == 8 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) -#elif VECT_SIZE == 16 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) -#endif - -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) -{ - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - mod; - - u32x 
append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - - u32x append4_t[4]; - - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; - - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } - #endif - - #ifdef 
IS_NV - - const int offset_minus_4 = 4 - mod; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - - u32x append4_t[4]; - - append4_t[0] = __byte_perm (append3[3], 0, selector); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - #endif - - switch (div) - { - case 0: block0[ 0] |= append0_t[0]; - block0[ 1] = append0_t[1]; - block0[ 2] = append0_t[2]; - block0[ 3] = append0_t[3]; - - block0[ 4] = append1_t[0]; - block0[ 5] = append1_t[1]; - block0[ 6] = append1_t[2]; - block0[ 7] = append1_t[3]; - - block0[ 8] = append2_t[0]; - block0[ 9] = append2_t[1]; - block0[10] = append2_t[2]; - block0[11] = append2_t[3]; - - block0[12] = append3_t[0]; - block0[13] = append3_t[1]; - block0[14] = append3_t[2]; - block0[15] = append3_t[3]; - - block1[ 0] = append4_t[0]; - block1[ 1] = append4_t[1]; - block1[ 2] = append4_t[2]; - block1[ 
3] = append4_t[3]; - break; - - case 1: block0[ 1] |= append0_t[0]; - block0[ 2] = append0_t[1]; - block0[ 3] = append0_t[2]; - block0[ 4] = append0_t[3]; - - block0[ 5] = append1_t[0]; - block0[ 6] = append1_t[1]; - block0[ 7] = append1_t[2]; - block0[ 8] = append1_t[3]; - - block0[ 9] = append2_t[0]; - block0[10] = append2_t[1]; - block0[11] = append2_t[2]; - block0[12] = append2_t[3]; - - block0[13] = append3_t[0]; - block0[14] = append3_t[1]; - block0[15] = append3_t[2]; - block1[ 0] = append3_t[3]; - - block1[ 1] = append4_t[0]; - block1[ 2] = append4_t[1]; - block1[ 3] = append4_t[2]; - block1[ 4] = append4_t[3]; - break; - - case 2: block0[ 2] |= append0_t[0]; - block0[ 3] = append0_t[1]; - block0[ 4] = append0_t[2]; - block0[ 5] = append0_t[3]; - - block0[ 6] = append1_t[0]; - block0[ 7] = append1_t[1]; - block0[ 8] = append1_t[2]; - block0[ 9] = append1_t[3]; - - block0[10] = append2_t[0]; - block0[11] = append2_t[1]; - block0[12] = append2_t[2]; - block0[13] = append2_t[3]; - - block0[14] = append3_t[0]; - block0[15] = append3_t[1]; - block1[ 0] = append3_t[2]; - block1[ 1] = append3_t[3]; - - block1[ 2] = append4_t[0]; - block1[ 3] = append4_t[1]; - block1[ 4] = append4_t[2]; - block1[ 5] = append4_t[3]; - break; - - case 3: block0[ 3] |= append0_t[0]; - block0[ 4] = append0_t[1]; - block0[ 5] = append0_t[2]; - block0[ 6] = append0_t[3]; - - block0[ 7] = append1_t[0]; - block0[ 8] = append1_t[1]; - block0[ 9] = append1_t[2]; - block0[10] = append1_t[3]; - - block0[11] = append2_t[0]; - block0[12] = append2_t[1]; - block0[13] = append2_t[2]; - block0[14] = append2_t[3]; - - block0[15] = append3_t[0]; - block1[ 0] = append3_t[1]; - block1[ 1] = append3_t[2]; - block1[ 2] = append3_t[3]; - - block1[ 3] = append4_t[0]; - block1[ 4] = append4_t[1]; - block1[ 5] = append4_t[2]; - block1[ 6] = append4_t[3]; - break; - - case 4: block0[ 4] |= append0_t[0]; - block0[ 5] = append0_t[1]; - block0[ 6] = append0_t[2]; - block0[ 7] = append0_t[3]; - - block0[ 8] = 
append1_t[0]; - block0[ 9] = append1_t[1]; - block0[10] = append1_t[2]; - block0[11] = append1_t[3]; - - block0[12] = append2_t[0]; - block0[13] = append2_t[1]; - block0[14] = append2_t[2]; - block0[15] = append2_t[3]; - - block1[ 0] = append3_t[0]; - block1[ 1] = append3_t[1]; - block1[ 2] = append3_t[2]; - block1[ 3] = append3_t[3]; - - block1[ 4] = append4_t[0]; - block1[ 5] = append4_t[1]; - block1[ 6] = append4_t[2]; - block1[ 7] = append4_t[3]; - break; - - case 5: block0[ 5] |= append0_t[0]; - block0[ 6] = append0_t[1]; - block0[ 7] = append0_t[2]; - block0[ 8] = append0_t[3]; - - block0[ 9] = append1_t[0]; - block0[10] = append1_t[1]; - block0[11] = append1_t[2]; - block0[12] = append1_t[3]; - - block0[13] = append2_t[0]; - block0[14] = append2_t[1]; - block0[15] = append2_t[2]; - block1[ 0] = append2_t[3]; - - block1[ 1] = append3_t[0]; - block1[ 2] = append3_t[1]; - block1[ 3] = append3_t[2]; - block1[ 4] = append3_t[3]; - - block1[ 5] = append4_t[0]; - block1[ 6] = append4_t[1]; - block1[ 7] = append4_t[2]; - block1[ 8] = append4_t[3]; - break; - - case 6: block0[ 6] |= append0_t[0]; - block0[ 7] = append0_t[1]; - block0[ 8] = append0_t[2]; - block0[ 9] = append0_t[3]; - - block0[10] = append1_t[0]; - block0[11] = append1_t[1]; - block0[12] = append1_t[2]; - block0[13] = append1_t[3]; - - block0[14] = append2_t[0]; - block0[15] = append2_t[1]; - block1[ 0] = append2_t[2]; - block1[ 1] = append2_t[3]; - - block1[ 2] = append3_t[0]; - block1[ 3] = append3_t[1]; - block1[ 4] = append3_t[2]; - block1[ 5] = append3_t[3]; - - block1[ 6] = append4_t[0]; - block1[ 7] = append4_t[1]; - block1[ 8] = append4_t[2]; - block1[ 9] = append4_t[3]; - break; - - case 7: block0[ 7] |= append0_t[0]; - block0[ 8] = append0_t[1]; - block0[ 9] = append0_t[2]; - block0[10] = append0_t[3]; - - block0[11] = append1_t[0]; - block0[12] = append1_t[1]; - block0[13] = append1_t[2]; - block0[14] = append1_t[3]; - - block0[15] = append2_t[0]; - block1[ 0] = append2_t[1]; - block1[ 1] = 
append2_t[2]; - block1[ 2] = append2_t[3]; - - block1[ 3] = append3_t[0]; - block1[ 4] = append3_t[1]; - block1[ 5] = append3_t[2]; - block1[ 6] = append3_t[3]; - - block1[ 7] = append4_t[0]; - block1[ 8] = append4_t[1]; - block1[ 9] = append4_t[2]; - block1[10] = append4_t[3]; - break; - - case 8: block0[ 8] |= append0_t[0]; - block0[ 9] = append0_t[1]; - block0[10] = append0_t[2]; - block0[11] = append0_t[3]; - - block0[12] = append1_t[0]; - block0[13] = append1_t[1]; - block0[14] = append1_t[2]; - block0[15] = append1_t[3]; - - block1[ 0] = append2_t[0]; - block1[ 1] = append2_t[1]; - block1[ 2] = append2_t[2]; - block1[ 3] = append2_t[3]; - - block1[ 4] = append3_t[0]; - block1[ 5] = append3_t[1]; - block1[ 6] = append3_t[2]; - block1[ 7] = append3_t[3]; - - block1[ 8] = append4_t[0]; - block1[ 9] = append4_t[1]; - block1[10] = append4_t[2]; - block1[11] = append4_t[3]; - break; - - case 9: block0[ 9] |= append0_t[0]; - block0[10] = append0_t[1]; - block0[11] = append0_t[2]; - block0[12] = append0_t[3]; - - block0[13] = append1_t[0]; - block0[14] = append1_t[1]; - block0[15] = append1_t[2]; - block1[ 0] = append1_t[3]; - - block1[ 1] = append2_t[0]; - block1[ 2] = append2_t[1]; - block1[ 3] = append2_t[2]; - block1[ 4] = append2_t[3]; - - block1[ 5] = append3_t[0]; - block1[ 6] = append3_t[1]; - block1[ 7] = append3_t[2]; - block1[ 8] = append3_t[3]; - - block1[ 9] = append4_t[0]; - block1[10] = append4_t[1]; - block1[11] = append4_t[2]; - block1[12] = append4_t[3]; - break; - - case 10: block0[10] |= append0_t[0]; - block0[11] = append0_t[1]; - block0[12] = append0_t[2]; - block0[13] = append0_t[3]; - - block0[14] = append1_t[0]; - block0[15] = append1_t[1]; - block1[ 0] = append1_t[2]; - block1[ 1] = append1_t[3]; - - block1[ 2] = append2_t[0]; - block1[ 3] = append2_t[1]; - block1[ 4] = append2_t[2]; - block1[ 5] = append2_t[3]; - - block1[ 6] = append3_t[0]; - block1[ 7] = append3_t[1]; - block1[ 8] = append3_t[2]; - block1[ 9] = append3_t[3]; - - 
block1[10] = append4_t[0]; - block1[11] = append4_t[1]; - block1[12] = append4_t[2]; - block1[13] = append4_t[3]; - break; - - case 11: block0[11] |= append0_t[0]; - block0[12] = append0_t[1]; - block0[13] = append0_t[2]; - block0[14] = append0_t[3]; - - block0[15] = append1_t[0]; - block1[ 0] = append1_t[1]; - block1[ 1] = append1_t[2]; - block1[ 2] = append1_t[3]; - - block1[ 3] = append2_t[0]; - block1[ 4] = append2_t[1]; - block1[ 5] = append2_t[2]; - block1[ 6] = append2_t[3]; - - block1[ 7] = append3_t[0]; - block1[ 8] = append3_t[1]; - block1[ 9] = append3_t[2]; - block1[10] = append3_t[3]; - - block1[11] = append4_t[0]; - block1[12] = append4_t[1]; - block1[13] = append4_t[2]; - block1[14] = append4_t[3]; - break; - - case 12: block0[12] |= append0_t[0]; - block0[13] = append0_t[1]; - block0[14] = append0_t[2]; - block0[15] = append0_t[3]; - - block1[ 0] = append1_t[0]; - block1[ 1] = append1_t[1]; - block1[ 2] = append1_t[2]; - block1[ 3] = append1_t[3]; - - block1[ 4] = append2_t[0]; - block1[ 5] = append2_t[1]; - block1[ 6] = append2_t[2]; - block1[ 7] = append2_t[3]; - - block1[ 8] = append3_t[0]; - block1[ 9] = append3_t[1]; - block1[10] = append3_t[2]; - block1[11] = append3_t[3]; - - block1[12] = append4_t[0]; - block1[13] = append4_t[1]; - block1[14] = append4_t[2]; - block1[15] = append4_t[3]; - break; - - case 13: block0[13] |= append0_t[0]; - block0[14] = append0_t[1]; - block0[15] = append0_t[2]; - block1[ 0] = append0_t[3]; - - block1[ 1] = append1_t[0]; - block1[ 2] = append1_t[1]; - block1[ 3] = append1_t[2]; - block1[ 4] = append1_t[3]; - - block1[ 5] = append2_t[0]; - block1[ 6] = append2_t[1]; - block1[ 7] = append2_t[2]; - block1[ 8] = append2_t[3]; - - block1[ 9] = append3_t[0]; - block1[10] = append3_t[1]; - block1[11] = append3_t[2]; - block1[12] = append3_t[3]; - - block1[13] = append4_t[0]; - block1[14] = append4_t[1]; - block1[15] = append4_t[2]; - break; - - case 14: block0[14] |= append0_t[0]; - block0[15] = append0_t[1]; - 
block1[ 0] = append0_t[2]; - block1[ 1] = append0_t[3]; - - block1[ 2] = append1_t[0]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - - block1[ 6] = append2_t[0]; - block1[ 7] = append2_t[1]; - block1[ 8] = append2_t[2]; - block1[ 9] = append2_t[3]; - - block1[10] = append3_t[0]; - block1[11] = append3_t[1]; - block1[12] = append3_t[2]; - block1[13] = append3_t[3]; - - block1[14] = append4_t[0]; - block1[15] = append4_t[1]; - break; - - case 15: block0[15] |= append0_t[0]; - block1[ 0] = append0_t[1]; - block1[ 1] = append0_t[2]; - block1[ 2] = append0_t[3]; - - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append1_t[0]; - - block1[ 7] = append2_t[0]; - block1[ 8] = append2_t[1]; - block1[ 9] = append2_t[2]; - block1[10] = append2_t[3]; - - block1[11] = append3_t[0]; - block1[12] = append3_t[1]; - block1[13] = append3_t[2]; - block1[14] = append3_t[3]; - - block1[15] = append4_t[0]; - break; - - case 16: block1[ 0] |= append0_t[0]; - block1[ 1] = append0_t[1]; - block1[ 2] = append0_t[2]; - block1[ 3] = append0_t[3]; - - block1[ 4] = append1_t[0]; - block1[ 5] = append1_t[1]; - block1[ 6] = append1_t[2]; - block1[ 7] = append1_t[3]; - - block1[ 8] = append2_t[0]; - block1[ 9] = append2_t[1]; - block1[10] = append2_t[2]; - block1[11] = append2_t[3]; - - block1[12] = append3_t[0]; - block1[13] = append3_t[1]; - block1[14] = append3_t[2]; - block1[15] = append3_t[3]; - break; - - case 17: block1[ 1] |= append0_t[0]; - block1[ 2] = append0_t[1]; - block1[ 3] = append0_t[2]; - block1[ 4] = append0_t[3]; - - block1[ 5] = append1_t[0]; - block1[ 6] = append1_t[1]; - block1[ 7] = append1_t[2]; - block1[ 8] = append1_t[3]; - - block1[ 9] = append2_t[0]; - block1[10] = append2_t[1]; - block1[11] = append2_t[2]; - block1[12] = append2_t[3]; - - block1[13] = append3_t[0]; - block1[14] = append3_t[1]; - block1[15] = append3_t[2]; - break; - - case 18: block1[ 2] |= append0_t[0]; - 
block1[ 3] = append0_t[1]; - block1[ 4] = append0_t[2]; - block1[ 5] = append0_t[3]; - - block1[ 6] = append1_t[0]; - block1[ 7] = append1_t[1]; - block1[ 8] = append1_t[2]; - block1[ 9] = append1_t[3]; - - block1[10] = append2_t[0]; - block1[11] = append2_t[1]; - block1[12] = append2_t[2]; - block1[13] = append2_t[3]; - - block1[14] = append3_t[0]; - block1[15] = append3_t[1]; - break; - - case 19: block1[ 3] |= append0_t[0]; - block1[ 4] = append0_t[1]; - block1[ 5] = append0_t[2]; - block1[ 6] = append0_t[3]; - - block1[ 7] = append1_t[0]; - block1[ 8] = append1_t[1]; - block1[ 9] = append1_t[2]; - block1[10] = append1_t[3]; - - block1[11] = append2_t[0]; - block1[12] = append2_t[1]; - block1[13] = append2_t[2]; - block1[14] = append2_t[3]; - - block1[15] = append3_t[0]; - break; - - case 20: block1[ 4] |= append0_t[0]; - block1[ 5] = append0_t[1]; - block1[ 6] = append0_t[2]; - block1[ 7] = append0_t[3]; - - block1[ 8] = append1_t[0]; - block1[ 9] = append1_t[1]; - block1[10] = append1_t[2]; - block1[11] = append1_t[3]; - - block1[12] = append2_t[0]; - block1[13] = append2_t[1]; - block1[14] = append2_t[2]; - block1[15] = append2_t[3]; - break; - - case 21: block1[ 5] |= append0_t[0]; - block1[ 6] = append0_t[1]; - block1[ 7] = append0_t[2]; - block1[ 8] = append0_t[3]; - - block1[ 9] = append1_t[0]; - block1[10] = append1_t[1]; - block1[11] = append1_t[2]; - block1[12] = append1_t[3]; - - block1[13] = append2_t[0]; - block1[14] = append2_t[1]; - block1[15] = append2_t[2]; - break; - - case 22: block1[ 6] |= append0_t[0]; - block1[ 7] = append0_t[1]; - block1[ 8] = append0_t[2]; - block1[ 9] = append0_t[3]; - - block1[10] = append1_t[0]; - block1[11] = append1_t[1]; - block1[12] = append1_t[2]; - block1[13] = append1_t[3]; - - block1[14] = append2_t[0]; - block1[15] = append2_t[1]; - break; - - case 23: block1[ 7] |= append0_t[0]; - block1[ 8] = append0_t[1]; - block1[ 9] = append0_t[2]; - block1[10] = append0_t[3]; - - block1[11] = append1_t[0]; - block1[12] = 
append1_t[1]; - block1[13] = append1_t[2]; - block1[14] = append1_t[3]; - - block1[15] = append2_t[0]; - break; - - case 24: block1[ 8] |= append0_t[0]; - block1[ 9] = append0_t[1]; - block1[10] = append0_t[2]; - block1[11] = append0_t[3]; - - block1[12] = append1_t[0]; - block1[13] = append1_t[1]; - block1[14] = append1_t[2]; - block1[15] = append1_t[3]; - break; - - case 25: block1[ 9] |= append0_t[0]; - block1[10] = append0_t[1]; - block1[11] = append0_t[2]; - block1[12] = append0_t[3]; - - block1[13] = append1_t[0]; - block1[14] = append1_t[1]; - block1[15] = append1_t[2]; - break; - - case 26: block1[10] |= append0_t[0]; - block1[11] = append0_t[1]; - block1[12] = append0_t[2]; - block1[13] = append0_t[3]; - - block1[14] = append1_t[0]; - block1[15] = append1_t[1]; - break; - - case 27: block1[11] |= append0_t[0]; - block1[12] = append0_t[1]; - block1[13] = append0_t[2]; - block1[14] = append0_t[3]; - - block1[15] = append1_t[0]; - break; - - case 28: block1[12] |= append0_t[0]; - block1[13] = append0_t[1]; - block1[14] = append0_t[2]; - block1[15] = append0_t[3]; - break; - - case 29: block1[13] |= append0_t[0]; - block1[14] = append0_t[1]; - block1[15] = append0_t[2]; - break; - - case 30: block1[14] |= append0_t[0]; - block1[15] = append0_t[1]; - break; - } - - u32 new_len = block_len + append_len; - - return new_len; -} - -__kernel void m11400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t 
*esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[0]; - pw_buf0[1] = pws[gid].i[1]; - pw_buf0[2] = pws[gid].i[2]; - pw_buf0[3] = pws[gid].i[3]; - pw_buf1[0] = pws[gid].i[4]; - pw_buf1[1] = pws[gid].i[5]; - pw_buf1[2] = pws[gid].i[6]; - pw_buf1[3] = pws[gid].i[7]; - - const u32 pw_l_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = 
esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = 
esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 
digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - /** - * concat password candidate - */ - - u32x wordl0[4] = { 0 }; - u32x wordl1[4] = { 0 }; - u32x wordl2[4] = { 0 }; - u32x wordl3[4] = { 0 }; - - wordl0[0] = pw_buf0[0]; - wordl0[1] = pw_buf0[1]; - wordl0[2] = pw_buf0[2]; - wordl0[3] = pw_buf0[3]; - wordl1[0] = pw_buf1[0]; - wordl1[1] = pw_buf1[1]; - wordl1[2] = pw_buf1[2]; - wordl1[3] = pw_buf1[3]; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); - - if (combs_mode == COMBINATOR_MODE_BASE_LEFT) - { - switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); - } - else - { - switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); - } - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = wordl0[0] | wordr0[0]; - w0[1] = wordl0[1] | wordr0[1]; - w0[2] = wordl0[2] | wordr0[2]; - w0[3] = wordl0[3] | wordr0[3]; - w1[0] = wordl1[0] | wordr1[0]; - w1[1] = wordl1[1] | wordr1[1]; - w1[2] = wordl1[2] | wordr1[2]; - w1[3] = wordl1[3] | wordr1[3]; - w2[0] = wordl2[0] | wordr2[0]; - w2[1] = wordl2[1] | wordr2[1]; - w2[2] = wordl2[2] | wordr2[2]; - w2[3] = wordl2[3] | wordr2[3]; - w3[0] = wordl3[0] | wordr3[0]; - w3[1] = wordl3[1] | wordr3[1]; - w3[2] = wordl3[2] | wordr3[2]; - 
w3[3] = wordl3[3] | wordr3[3]; - - const u32x pw_salt_len = salt_len + pw_len; - - /* - * HA1 = md5 ($salt . $pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, 
c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], 
MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = 
block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - 
MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! 
- - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, 
w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes > 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - 
w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -__kernel void m11400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[0]; - pw_buf0[1] = pws[gid].i[1]; - pw_buf0[2] = pws[gid].i[2]; - pw_buf0[3] = pws[gid].i[3]; - pw_buf1[0] = pws[gid].i[4]; - pw_buf1[1] = pws[gid].i[5]; - pw_buf1[2] = pws[gid].i[6]; - pw_buf1[3] = pws[gid].i[7]; - - const u32 pw_l_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = 
esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - 
esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - /** - * concat password candidate - */ - - u32x wordl0[4] = { 0 }; - u32x wordl1[4] = { 0 }; - u32x wordl2[4] = { 0 }; - u32x wordl3[4] = { 0 }; - - wordl0[0] = pw_buf0[0]; - wordl0[1] = pw_buf0[1]; - wordl0[2] = pw_buf0[2]; - wordl0[3] = pw_buf0[3]; - wordl1[0] = 
pw_buf1[0]; - wordl1[1] = pw_buf1[1]; - wordl1[2] = pw_buf1[2]; - wordl1[3] = pw_buf1[3]; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); - - if (combs_mode == COMBINATOR_MODE_BASE_LEFT) - { - switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); - } - else - { - switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); - } - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = wordl0[0] | wordr0[0]; - w0[1] = wordl0[1] | wordr0[1]; - w0[2] = wordl0[2] | wordr0[2]; - w0[3] = wordl0[3] | wordr0[3]; - w1[0] = wordl1[0] | wordr1[0]; - w1[1] = wordl1[1] | wordr1[1]; - w1[2] = wordl1[2] | wordr1[2]; - w1[3] = wordl1[3] | wordr1[3]; - w2[0] = wordl2[0] | wordr2[0]; - w2[1] = wordl2[1] | wordr2[1]; - w2[2] = wordl2[2] | wordr2[2]; - w2[3] = wordl2[3] | wordr2[3]; - w3[0] = wordl3[0] | wordr3[0]; - w3[1] = wordl3[1] | wordr3[1]; - w3[2] = wordl3[2] | wordr3[2]; - w3[3] = wordl3[3] | wordr3[3]; - - const u32x pw_salt_len = salt_len + pw_len; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; 
- w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - 
MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! 
- - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, 
w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes > 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - 
w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -__kernel void m11400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m11400_a1.cl b/OpenCL/m11400_a1.cl new file mode 100644 index 000000000..67b13c3bc --- /dev/null +++ b/OpenCL/m11400_a1.cl @@ -0,0 +1,226 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, 
__global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, esalt_bufs[digests_offset].esalt_buf, esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global 
void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, esalt_bufs[digests_offset].esalt_buf, esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 
r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11400_a3-optimized.cl b/OpenCL/m11400_a3-optimized.cl deleted file mode 100644 index efcffb31e..000000000 --- a/OpenCL/m11400_a3-optimized.cl +++ /dev/null @@ -1,6043 +0,0 @@ -/** - * Author......: See docs/credits.txt - * License.....: MIT - */ - -//incompatible because of brances -//#define NEW_SIMD_CODE - -#include "inc_vendor.cl" -#include "inc_hash_constants.h" -#include "inc_hash_functions.cl" -#include "inc_types.cl" -#include "inc_common.cl" -#include "inc_simd.cl" - -#if VECT_SIZE == 1 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) -#elif VECT_SIZE == 2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#elif VECT_SIZE == 4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#elif VECT_SIZE == 8 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) -#elif VECT_SIZE == 16 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) -#endif - -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) -{ - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - mod; - - u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); 
- append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - - u32x append4_t[4]; - - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; - - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } - #endif - - #ifdef IS_NV - - const int offset_minus_4 = 4 - mod; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32x append0_t[4]; - - 
append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - - u32x append4_t[4]; - - append4_t[0] = __byte_perm (append3[3], 0, selector); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - #endif - - switch (div) - { - case 0: block0[ 0] |= append0_t[0]; - block0[ 1] = append0_t[1]; - block0[ 2] = append0_t[2]; - block0[ 3] = append0_t[3]; - - block0[ 4] = append1_t[0]; - block0[ 5] = append1_t[1]; - block0[ 6] = append1_t[2]; - block0[ 7] = append1_t[3]; - - block0[ 8] = append2_t[0]; - block0[ 9] = append2_t[1]; - block0[10] = append2_t[2]; - block0[11] = append2_t[3]; - - block0[12] = append3_t[0]; - block0[13] = append3_t[1]; - block0[14] = append3_t[2]; - block0[15] = append3_t[3]; - - block1[ 0] = append4_t[0]; - block1[ 1] = append4_t[1]; - block1[ 2] = append4_t[2]; - block1[ 3] = append4_t[3]; - break; - - case 1: block0[ 1] |= append0_t[0]; - block0[ 2] = append0_t[1]; - block0[ 3] = append0_t[2]; - block0[ 4] = 
append0_t[3]; - - block0[ 5] = append1_t[0]; - block0[ 6] = append1_t[1]; - block0[ 7] = append1_t[2]; - block0[ 8] = append1_t[3]; - - block0[ 9] = append2_t[0]; - block0[10] = append2_t[1]; - block0[11] = append2_t[2]; - block0[12] = append2_t[3]; - - block0[13] = append3_t[0]; - block0[14] = append3_t[1]; - block0[15] = append3_t[2]; - block1[ 0] = append3_t[3]; - - block1[ 1] = append4_t[0]; - block1[ 2] = append4_t[1]; - block1[ 3] = append4_t[2]; - block1[ 4] = append4_t[3]; - break; - - case 2: block0[ 2] |= append0_t[0]; - block0[ 3] = append0_t[1]; - block0[ 4] = append0_t[2]; - block0[ 5] = append0_t[3]; - - block0[ 6] = append1_t[0]; - block0[ 7] = append1_t[1]; - block0[ 8] = append1_t[2]; - block0[ 9] = append1_t[3]; - - block0[10] = append2_t[0]; - block0[11] = append2_t[1]; - block0[12] = append2_t[2]; - block0[13] = append2_t[3]; - - block0[14] = append3_t[0]; - block0[15] = append3_t[1]; - block1[ 0] = append3_t[2]; - block1[ 1] = append3_t[3]; - - block1[ 2] = append4_t[0]; - block1[ 3] = append4_t[1]; - block1[ 4] = append4_t[2]; - block1[ 5] = append4_t[3]; - break; - - case 3: block0[ 3] |= append0_t[0]; - block0[ 4] = append0_t[1]; - block0[ 5] = append0_t[2]; - block0[ 6] = append0_t[3]; - - block0[ 7] = append1_t[0]; - block0[ 8] = append1_t[1]; - block0[ 9] = append1_t[2]; - block0[10] = append1_t[3]; - - block0[11] = append2_t[0]; - block0[12] = append2_t[1]; - block0[13] = append2_t[2]; - block0[14] = append2_t[3]; - - block0[15] = append3_t[0]; - block1[ 0] = append3_t[1]; - block1[ 1] = append3_t[2]; - block1[ 2] = append3_t[3]; - - block1[ 3] = append4_t[0]; - block1[ 4] = append4_t[1]; - block1[ 5] = append4_t[2]; - block1[ 6] = append4_t[3]; - break; - - case 4: block0[ 4] |= append0_t[0]; - block0[ 5] = append0_t[1]; - block0[ 6] = append0_t[2]; - block0[ 7] = append0_t[3]; - - block0[ 8] = append1_t[0]; - block0[ 9] = append1_t[1]; - block0[10] = append1_t[2]; - block0[11] = append1_t[3]; - - block0[12] = append2_t[0]; - block0[13] 
= append2_t[1]; - block0[14] = append2_t[2]; - block0[15] = append2_t[3]; - - block1[ 0] = append3_t[0]; - block1[ 1] = append3_t[1]; - block1[ 2] = append3_t[2]; - block1[ 3] = append3_t[3]; - - block1[ 4] = append4_t[0]; - block1[ 5] = append4_t[1]; - block1[ 6] = append4_t[2]; - block1[ 7] = append4_t[3]; - break; - - case 5: block0[ 5] |= append0_t[0]; - block0[ 6] = append0_t[1]; - block0[ 7] = append0_t[2]; - block0[ 8] = append0_t[3]; - - block0[ 9] = append1_t[0]; - block0[10] = append1_t[1]; - block0[11] = append1_t[2]; - block0[12] = append1_t[3]; - - block0[13] = append2_t[0]; - block0[14] = append2_t[1]; - block0[15] = append2_t[2]; - block1[ 0] = append2_t[3]; - - block1[ 1] = append3_t[0]; - block1[ 2] = append3_t[1]; - block1[ 3] = append3_t[2]; - block1[ 4] = append3_t[3]; - - block1[ 5] = append4_t[0]; - block1[ 6] = append4_t[1]; - block1[ 7] = append4_t[2]; - block1[ 8] = append4_t[3]; - break; - - case 6: block0[ 6] |= append0_t[0]; - block0[ 7] = append0_t[1]; - block0[ 8] = append0_t[2]; - block0[ 9] = append0_t[3]; - - block0[10] = append1_t[0]; - block0[11] = append1_t[1]; - block0[12] = append1_t[2]; - block0[13] = append1_t[3]; - - block0[14] = append2_t[0]; - block0[15] = append2_t[1]; - block1[ 0] = append2_t[2]; - block1[ 1] = append2_t[3]; - - block1[ 2] = append3_t[0]; - block1[ 3] = append3_t[1]; - block1[ 4] = append3_t[2]; - block1[ 5] = append3_t[3]; - - block1[ 6] = append4_t[0]; - block1[ 7] = append4_t[1]; - block1[ 8] = append4_t[2]; - block1[ 9] = append4_t[3]; - break; - - case 7: block0[ 7] |= append0_t[0]; - block0[ 8] = append0_t[1]; - block0[ 9] = append0_t[2]; - block0[10] = append0_t[3]; - - block0[11] = append1_t[0]; - block0[12] = append1_t[1]; - block0[13] = append1_t[2]; - block0[14] = append1_t[3]; - - block0[15] = append2_t[0]; - block1[ 0] = append2_t[1]; - block1[ 1] = append2_t[2]; - block1[ 2] = append2_t[3]; - - block1[ 3] = append3_t[0]; - block1[ 4] = append3_t[1]; - block1[ 5] = append3_t[2]; - block1[ 6] 
= append3_t[3]; - - block1[ 7] = append4_t[0]; - block1[ 8] = append4_t[1]; - block1[ 9] = append4_t[2]; - block1[10] = append4_t[3]; - break; - - case 8: block0[ 8] |= append0_t[0]; - block0[ 9] = append0_t[1]; - block0[10] = append0_t[2]; - block0[11] = append0_t[3]; - - block0[12] = append1_t[0]; - block0[13] = append1_t[1]; - block0[14] = append1_t[2]; - block0[15] = append1_t[3]; - - block1[ 0] = append2_t[0]; - block1[ 1] = append2_t[1]; - block1[ 2] = append2_t[2]; - block1[ 3] = append2_t[3]; - - block1[ 4] = append3_t[0]; - block1[ 5] = append3_t[1]; - block1[ 6] = append3_t[2]; - block1[ 7] = append3_t[3]; - - block1[ 8] = append4_t[0]; - block1[ 9] = append4_t[1]; - block1[10] = append4_t[2]; - block1[11] = append4_t[3]; - break; - - case 9: block0[ 9] |= append0_t[0]; - block0[10] = append0_t[1]; - block0[11] = append0_t[2]; - block0[12] = append0_t[3]; - - block0[13] = append1_t[0]; - block0[14] = append1_t[1]; - block0[15] = append1_t[2]; - block1[ 0] = append1_t[3]; - - block1[ 1] = append2_t[0]; - block1[ 2] = append2_t[1]; - block1[ 3] = append2_t[2]; - block1[ 4] = append2_t[3]; - - block1[ 5] = append3_t[0]; - block1[ 6] = append3_t[1]; - block1[ 7] = append3_t[2]; - block1[ 8] = append3_t[3]; - - block1[ 9] = append4_t[0]; - block1[10] = append4_t[1]; - block1[11] = append4_t[2]; - block1[12] = append4_t[3]; - break; - - case 10: block0[10] |= append0_t[0]; - block0[11] = append0_t[1]; - block0[12] = append0_t[2]; - block0[13] = append0_t[3]; - - block0[14] = append1_t[0]; - block0[15] = append1_t[1]; - block1[ 0] = append1_t[2]; - block1[ 1] = append1_t[3]; - - block1[ 2] = append2_t[0]; - block1[ 3] = append2_t[1]; - block1[ 4] = append2_t[2]; - block1[ 5] = append2_t[3]; - - block1[ 6] = append3_t[0]; - block1[ 7] = append3_t[1]; - block1[ 8] = append3_t[2]; - block1[ 9] = append3_t[3]; - - block1[10] = append4_t[0]; - block1[11] = append4_t[1]; - block1[12] = append4_t[2]; - block1[13] = append4_t[3]; - break; - - case 11: block0[11] |= 
append0_t[0]; - block0[12] = append0_t[1]; - block0[13] = append0_t[2]; - block0[14] = append0_t[3]; - - block0[15] = append1_t[0]; - block1[ 0] = append1_t[1]; - block1[ 1] = append1_t[2]; - block1[ 2] = append1_t[3]; - - block1[ 3] = append2_t[0]; - block1[ 4] = append2_t[1]; - block1[ 5] = append2_t[2]; - block1[ 6] = append2_t[3]; - - block1[ 7] = append3_t[0]; - block1[ 8] = append3_t[1]; - block1[ 9] = append3_t[2]; - block1[10] = append3_t[3]; - - block1[11] = append4_t[0]; - block1[12] = append4_t[1]; - block1[13] = append4_t[2]; - block1[14] = append4_t[3]; - break; - - case 12: block0[12] |= append0_t[0]; - block0[13] = append0_t[1]; - block0[14] = append0_t[2]; - block0[15] = append0_t[3]; - - block1[ 0] = append1_t[0]; - block1[ 1] = append1_t[1]; - block1[ 2] = append1_t[2]; - block1[ 3] = append1_t[3]; - - block1[ 4] = append2_t[0]; - block1[ 5] = append2_t[1]; - block1[ 6] = append2_t[2]; - block1[ 7] = append2_t[3]; - - block1[ 8] = append3_t[0]; - block1[ 9] = append3_t[1]; - block1[10] = append3_t[2]; - block1[11] = append3_t[3]; - - block1[12] = append4_t[0]; - block1[13] = append4_t[1]; - block1[14] = append4_t[2]; - block1[15] = append4_t[3]; - break; - - case 13: block0[13] |= append0_t[0]; - block0[14] = append0_t[1]; - block0[15] = append0_t[2]; - block1[ 0] = append0_t[3]; - - block1[ 1] = append1_t[0]; - block1[ 2] = append1_t[1]; - block1[ 3] = append1_t[2]; - block1[ 4] = append1_t[3]; - - block1[ 5] = append2_t[0]; - block1[ 6] = append2_t[1]; - block1[ 7] = append2_t[2]; - block1[ 8] = append2_t[3]; - - block1[ 9] = append3_t[0]; - block1[10] = append3_t[1]; - block1[11] = append3_t[2]; - block1[12] = append3_t[3]; - - block1[13] = append4_t[0]; - block1[14] = append4_t[1]; - block1[15] = append4_t[2]; - break; - - case 14: block0[14] |= append0_t[0]; - block0[15] = append0_t[1]; - block1[ 0] = append0_t[2]; - block1[ 1] = append0_t[3]; - - block1[ 2] = append1_t[0]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 
5] = append1_t[3]; - - block1[ 6] = append2_t[0]; - block1[ 7] = append2_t[1]; - block1[ 8] = append2_t[2]; - block1[ 9] = append2_t[3]; - - block1[10] = append3_t[0]; - block1[11] = append3_t[1]; - block1[12] = append3_t[2]; - block1[13] = append3_t[3]; - - block1[14] = append4_t[0]; - block1[15] = append4_t[1]; - break; - - case 15: block0[15] |= append0_t[0]; - block1[ 0] = append0_t[1]; - block1[ 1] = append0_t[2]; - block1[ 2] = append0_t[3]; - - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append1_t[0]; - - block1[ 7] = append2_t[0]; - block1[ 8] = append2_t[1]; - block1[ 9] = append2_t[2]; - block1[10] = append2_t[3]; - - block1[11] = append3_t[0]; - block1[12] = append3_t[1]; - block1[13] = append3_t[2]; - block1[14] = append3_t[3]; - - block1[15] = append4_t[0]; - break; - - case 16: block1[ 0] |= append0_t[0]; - block1[ 1] = append0_t[1]; - block1[ 2] = append0_t[2]; - block1[ 3] = append0_t[3]; - - block1[ 4] = append1_t[0]; - block1[ 5] = append1_t[1]; - block1[ 6] = append1_t[2]; - block1[ 7] = append1_t[3]; - - block1[ 8] = append2_t[0]; - block1[ 9] = append2_t[1]; - block1[10] = append2_t[2]; - block1[11] = append2_t[3]; - - block1[12] = append3_t[0]; - block1[13] = append3_t[1]; - block1[14] = append3_t[2]; - block1[15] = append3_t[3]; - break; - - case 17: block1[ 1] |= append0_t[0]; - block1[ 2] = append0_t[1]; - block1[ 3] = append0_t[2]; - block1[ 4] = append0_t[3]; - - block1[ 5] = append1_t[0]; - block1[ 6] = append1_t[1]; - block1[ 7] = append1_t[2]; - block1[ 8] = append1_t[3]; - - block1[ 9] = append2_t[0]; - block1[10] = append2_t[1]; - block1[11] = append2_t[2]; - block1[12] = append2_t[3]; - - block1[13] = append3_t[0]; - block1[14] = append3_t[1]; - block1[15] = append3_t[2]; - break; - - case 18: block1[ 2] |= append0_t[0]; - block1[ 3] = append0_t[1]; - block1[ 4] = append0_t[2]; - block1[ 5] = append0_t[3]; - - block1[ 6] = append1_t[0]; - block1[ 7] = append1_t[1]; - block1[ 8] 
= append1_t[2]; - block1[ 9] = append1_t[3]; - - block1[10] = append2_t[0]; - block1[11] = append2_t[1]; - block1[12] = append2_t[2]; - block1[13] = append2_t[3]; - - block1[14] = append3_t[0]; - block1[15] = append3_t[1]; - break; - - case 19: block1[ 3] |= append0_t[0]; - block1[ 4] = append0_t[1]; - block1[ 5] = append0_t[2]; - block1[ 6] = append0_t[3]; - - block1[ 7] = append1_t[0]; - block1[ 8] = append1_t[1]; - block1[ 9] = append1_t[2]; - block1[10] = append1_t[3]; - - block1[11] = append2_t[0]; - block1[12] = append2_t[1]; - block1[13] = append2_t[2]; - block1[14] = append2_t[3]; - - block1[15] = append3_t[0]; - break; - - case 20: block1[ 4] |= append0_t[0]; - block1[ 5] = append0_t[1]; - block1[ 6] = append0_t[2]; - block1[ 7] = append0_t[3]; - - block1[ 8] = append1_t[0]; - block1[ 9] = append1_t[1]; - block1[10] = append1_t[2]; - block1[11] = append1_t[3]; - - block1[12] = append2_t[0]; - block1[13] = append2_t[1]; - block1[14] = append2_t[2]; - block1[15] = append2_t[3]; - break; - - case 21: block1[ 5] |= append0_t[0]; - block1[ 6] = append0_t[1]; - block1[ 7] = append0_t[2]; - block1[ 8] = append0_t[3]; - - block1[ 9] = append1_t[0]; - block1[10] = append1_t[1]; - block1[11] = append1_t[2]; - block1[12] = append1_t[3]; - - block1[13] = append2_t[0]; - block1[14] = append2_t[1]; - block1[15] = append2_t[2]; - break; - - case 22: block1[ 6] |= append0_t[0]; - block1[ 7] = append0_t[1]; - block1[ 8] = append0_t[2]; - block1[ 9] = append0_t[3]; - - block1[10] = append1_t[0]; - block1[11] = append1_t[1]; - block1[12] = append1_t[2]; - block1[13] = append1_t[3]; - - block1[14] = append2_t[0]; - block1[15] = append2_t[1]; - break; - - case 23: block1[ 7] |= append0_t[0]; - block1[ 8] = append0_t[1]; - block1[ 9] = append0_t[2]; - block1[10] = append0_t[3]; - - block1[11] = append1_t[0]; - block1[12] = append1_t[1]; - block1[13] = append1_t[2]; - block1[14] = append1_t[3]; - - block1[15] = append2_t[0]; - break; - - case 24: block1[ 8] |= append0_t[0]; - 
block1[ 9] = append0_t[1]; - block1[10] = append0_t[2]; - block1[11] = append0_t[3]; - - block1[12] = append1_t[0]; - block1[13] = append1_t[1]; - block1[14] = append1_t[2]; - block1[15] = append1_t[3]; - break; - - case 25: block1[ 9] |= append0_t[0]; - block1[10] = append0_t[1]; - block1[11] = append0_t[2]; - block1[12] = append0_t[3]; - - block1[13] = append1_t[0]; - block1[14] = append1_t[1]; - block1[15] = append1_t[2]; - break; - - case 26: block1[10] |= append0_t[0]; - block1[11] = append0_t[1]; - block1[12] = append0_t[2]; - block1[13] = append0_t[3]; - - block1[14] = append1_t[0]; - block1[15] = append1_t[1]; - break; - - case 27: block1[11] |= append0_t[0]; - block1[12] = append0_t[1]; - block1[13] = append0_t[2]; - block1[14] = append0_t[3]; - - block1[15] = append1_t[0]; - break; - - case 28: block1[12] |= append0_t[0]; - block1[13] = append0_t[1]; - block1[14] = append0_t[2]; - block1[15] = append0_t[3]; - break; - - case 29: block1[13] |= append0_t[0]; - block1[14] = append0_t[1]; - block1[15] = append0_t[2]; - break; - - case 30: block1[14] |= append0_t[0]; - block1[15] = append0_t[1]; - break; - } - - u32 new_len = block_len + append_len; - - return new_len; -} - -void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, 
__global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = 
esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = 
esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . $pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - 
w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - 
MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, 
d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, 
w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, 
MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = 
esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP 
(MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 
w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; 
- salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - 
esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; 
il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . $pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; 
- w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, 
w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - 
MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, 
a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = 
esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = 
esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, 
b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], 
MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = 
esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - 
esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = 
esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = 
esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, 
w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const 
salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = 
esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] 
= esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, 
w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - 
MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, 
a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = 
esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = 
esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, 
b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], 
MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = 
esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] 
= esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -__kernel void m11400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400m_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400m_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400m_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, 
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400m_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = 
get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400m_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400m_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, 
d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400m_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400m_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - - u32 w3[4]; - - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400m_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, 
bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400m_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400m_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400m_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400s_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400s_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400s_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, 
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400s_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = 
get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400s_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400s_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, 
d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400s_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400s_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - - u32 w3[4]; - - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400s_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, 
bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400s_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400s_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400s_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} diff --git a/OpenCL/m11400_a3.cl b/OpenCL/m11400_a3.cl new file mode 100644 index 000000000..cace5a474 --- /dev/null +++ b/OpenCL/m11400_a3.cl @@ -0,0 +1,286 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include 
"inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, 
const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; + + const u32 esalt_lenv = ceil ((float) esalt_len / 4); + + u32x esalt_buf[64] = { 0 }; + + for (int idx = 0; idx < esalt_lenv; idx++) + { + esalt_buf[idx] = esalt_bufs[digests_offset].esalt_buf[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx1; + + md5_init_vector_from_scalar (&ctx1, &ctx0); + + md5_update_vector (&ctx1, w, pw_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) 
<< 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_vector (&ctx, esalt_buf, esalt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = 
get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; + + const u32 esalt_lenv = ceil ((float) esalt_len / 4); + + u32x esalt_buf[64] = { 0 }; + + for (int idx = 0; idx < esalt_lenv; idx++) + { + esalt_buf[idx] = esalt_bufs[digests_offset].esalt_buf[idx]; + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx1; + + md5_init_vector_from_scalar (&ctx1, &ctx0); + + md5_update_vector (&ctx1, w, pw_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | 
uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_vector (&ctx, esalt_buf, esalt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11600.cl b/OpenCL/m11600.cl index 44676b478..34b00d893 100644 --- a/OpenCL/m11600.cl +++ b/OpenCL/m11600.cl @@ -177,8 +177,6 @@ __kernel void m11600_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m12400.cl b/OpenCL/m12400.cl index 7ac2b0f0f..1f56c3b4c 100644 --- a/OpenCL/m12400.cl +++ b/OpenCL/m12400.cl @@ -544,8 +544,6 @@ __kernel void m12400_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } u32 tt; diff --git a/OpenCL/m12600_a0.cl b/OpenCL/m12600_a0.cl new file mode 100644 index 000000000..22ad327fb --- /dev/null +++ b/OpenCL/m12600_a0.cl @@ -0,0 +1,328 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include 
"inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" +#include "inc_hash_sha256.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 
loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = 
uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + + +__kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global 
plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; 
il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + 
const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m12600_a1.cl b/OpenCL/m12600_a1.cl new file mode 100644 index 000000000..d20b3ae90 --- /dev/null +++ b/OpenCL/m12600_a1.cl @@ -0,0 +1,307 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" +#include "inc_hash_sha256.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const 
u32 e = ctx1.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t 
*bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) 
<< 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m12600_a3.cl b/OpenCL/m12600_a3.cl new file mode 100644 index 000000000..75dde746a --- /dev/null +++ b/OpenCL/m12600_a3.cl @@ -0,0 +1,337 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" +#include "inc_hash_sha256.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) 
(l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha256_ctx_vector_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 
24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final_vector (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, 
const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = 
ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha256_ctx_vector_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final_vector (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13100_a0.cl b/OpenCL/m13100_a0.cl new file mode 100644 index 000000000..a3672cff3 --- 
/dev/null +++ b/OpenCL/m13100_a0.cl @@ -0,0 +1,512 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, 
idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], __global const u32 *edata2, const u32 edata2_len, const u32 K2[4], const u32 checksum[4]) +{ + rc4_init_16 (rc4_key, data); + + u32 out0[4]; + u32 out1[4]; + + u8 i = 0; + u8 j = 0; + + /* + 8 first bytes are nonce, then ASN1 structs (DER encoding: type-length-data) + + if length >= 128 bytes: + length is on 2 bytes and type is \x63\x82 (encode_krb5_enc_tkt_part) and data is an ASN1 sequence \x30\x82 + else: + length is on 1 byte and type is \x63\x81 and data is an ASN1 sequence \x30\x81 + + next headers follow the same ASN1 "type-length-data" scheme + */ + + j = rc4_next_16 (rc4_key, i, j, edata2 + 0, out0); i += 16; + + if (((out0[2] & 0xff00ffff) != 0x30008163) && ((out0[2] & 0x0000ffff) != 0x00008263)) return 0; + + j = rc4_next_16 (rc4_key, i, j, edata2 + 4, out1); i += 16; + + if (((out1[0] & 0x00ffffff) != 0x00000503) && (out1[0] != 0x050307A0)) return 0; + + rc4_init_16 (rc4_key, data); + + i = 0; + j = 0; + + // init hmac + + u32 w0[4]; + u32 w1[4]; + u32 
w2[4]; + u32 w3[4]; + + w0[0] = K2[0]; + w0[1] = K2[1]; + w0[2] = K2[2]; + w0[3] = K2[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + int edata2_left; + + for (edata2_left = edata2_len; edata2_left >= 64; edata2_left -= 64) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } + + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + if (edata2_left < 16) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + + truncate_block_4x4_le (w0, edata2_left & 0xf); + } + else if (edata2_left < 32) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + + truncate_block_4x4_le (w1, edata2_left & 0xf); + } + else if (edata2_left < 48) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + + truncate_block_4x4_le (w2, edata2_left & 0xf); + } + else + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + truncate_block_4x4_le (w3, edata2_left & 0xf); + } + + md5_hmac_update_64 
(&ctx, w0, w1, w2, w3, edata2_left); + + md5_hmac_final (&ctx); + + if (checksum[0] != ctx.opad.h[0]) return 0; + if (checksum[1] != ctx.opad.h[1]) return 0; + if (checksum[2] != ctx.opad.h[2]) return 0; + if (checksum[3] != ctx.opad.h[3]) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4], u32 K2[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 2; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; + + K2[0] = ctx1.opad.h[0]; + K2[1] = ctx1.opad.h[1]; + K2[2] = ctx1.opad.h[2]; + K2[3] = ctx1.opad.h[3]; +} + +__kernel void m13100_mxx 
(__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check 
(&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + 
+ /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m13100_a1.cl b/OpenCL/m13100_a1.cl new file mode 100644 index 000000000..38b1ed643 --- /dev/null +++ b/OpenCL/m13100_a1.cl @@ -0,0 +1,492 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += 
rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], __global const u32 *edata2, const u32 edata2_len, const u32 K2[4], const u32 checksum[4]) +{ + rc4_init_16 (rc4_key, data); + + u32 out0[4]; + u32 out1[4]; + + u8 i = 0; + u8 j = 0; + + /* + 8 first bytes are nonce, then ASN1 structs (DER encoding: type-length-data) + + if length >= 128 bytes: + length 
is on 2 bytes and type is \x63\x82 (encode_krb5_enc_tkt_part) and data is an ASN1 sequence \x30\x82 + else: + length is on 1 byte and type is \x63\x81 and data is an ASN1 sequence \x30\x81 + + next headers follow the same ASN1 "type-length-data" scheme + */ + + j = rc4_next_16 (rc4_key, i, j, edata2 + 0, out0); i += 16; + + if (((out0[2] & 0xff00ffff) != 0x30008163) && ((out0[2] & 0x0000ffff) != 0x00008263)) return 0; + + j = rc4_next_16 (rc4_key, i, j, edata2 + 4, out1); i += 16; + + if (((out1[0] & 0x00ffffff) != 0x00000503) && (out1[0] != 0x050307A0)) return 0; + + rc4_init_16 (rc4_key, data); + + i = 0; + j = 0; + + // init hmac + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K2[0]; + w0[1] = K2[1]; + w0[2] = K2[2]; + w0[3] = K2[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + int edata2_left; + + for (edata2_left = edata2_len; edata2_left >= 64; edata2_left -= 64) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } + + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + if (edata2_left < 16) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + + truncate_block_4x4_le (w0, edata2_left & 0xf); + } + else if (edata2_left < 32) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + + truncate_block_4x4_le (w1, edata2_left & 
0xf); + } + else if (edata2_left < 48) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + + truncate_block_4x4_le (w2, edata2_left & 0xf); + } + else + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + truncate_block_4x4_le (w3, edata2_left & 0xf); + } + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, edata2_left); + + md5_hmac_final (&ctx); + + if (checksum[0] != ctx.opad.h[0]) return 0; + if (checksum[1] != ctx.opad.h[1]) return 0; + if (checksum[2] != ctx.opad.h[2]) return 0; + if (checksum[3] != ctx.opad.h[3]) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4], u32 K2[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 2; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + 
w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; + + K2[0] = ctx1.opad.h[0]; + K2[1] = ctx1.opad.h[1]; + K2[2] = ctx1.opad.h[2]; + K2[3] = ctx1.opad.h[3]; +} + +__kernel void m13100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = 
krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = 
get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m13100_a3.cl b/OpenCL/m13100_a3.cl new file mode 100644 index 000000000..3b7b8b742 --- /dev/null +++ b/OpenCL/m13100_a3.cl @@ -0,0 +1,522 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) 
rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap 
(rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], __global const u32 *edata2, const u32 edata2_len, const u32 K2[4], const u32 checksum[4]) +{ + rc4_init_16 (rc4_key, data); + + u32 out0[4]; + u32 out1[4]; + + u8 i = 0; + u8 j = 0; + + /* + 8 first bytes are nonce, then ASN1 structs (DER encoding: type-length-data) + + if length >= 128 bytes: + length is on 2 bytes and type is \x63\x82 (encode_krb5_enc_tkt_part) and data is an ASN1 sequence \x30\x82 + else: + length is on 1 byte and type is \x63\x81 and data is an ASN1 sequence \x30\x81 + + next headers follow the same ASN1 "type-length-data" scheme + */ + + j = rc4_next_16 (rc4_key, i, j, edata2 + 0, out0); i += 16; + + if (((out0[2] & 0xff00ffff) != 0x30008163) && ((out0[2] & 0x0000ffff) != 0x00008263)) return 0; + + j = rc4_next_16 (rc4_key, i, j, edata2 + 4, out1); i += 16; + + if (((out1[0] & 0x00ffffff) != 0x00000503) && (out1[0] != 0x050307A0)) return 0; + + rc4_init_16 (rc4_key, data); + + i = 0; + j = 0; + + // init hmac + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K2[0]; + w0[1] = K2[1]; + w0[2] = K2[2]; + w0[3] = K2[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + int edata2_left; + + for (edata2_left = edata2_len; edata2_left >= 64; edata2_left -= 64) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } + + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + 
w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + if (edata2_left < 16) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + + truncate_block_4x4_le (w0, edata2_left & 0xf); + } + else if (edata2_left < 32) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + + truncate_block_4x4_le (w1, edata2_left & 0xf); + } + else if (edata2_left < 48) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + + truncate_block_4x4_le (w2, edata2_left & 0xf); + } + else + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + truncate_block_4x4_le (w3, edata2_left & 0xf); + } + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, edata2_left); + + md5_hmac_final (&ctx); + + if (checksum[0] != ctx.opad.h[0]) return 0; + if (checksum[1] != ctx.opad.h[1]) return 0; + if (checksum[2] != ctx.opad.h[2]) return 0; + if (checksum[3] != ctx.opad.h[3]) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4], u32 K2[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 2; + w0[1] = 0; 
+ w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; + + K2[0] = ctx1.opad.h[0]; + K2[1] = ctx1.opad.h[1]; + K2[2] = ctx1.opad.h[2]; + K2[3] = ctx1.opad.h[3]; +} + +__kernel void m13100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const 
u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); 
+ } + } + } +} diff --git a/OpenCL/m13300_a0.cl b/OpenCL/m13300_a0.cl new file mode 100644 index 000000000..a8e85f335 --- /dev/null +++ b/OpenCL/m13300_a0.cl @@ -0,0 +1,134 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + 
// todo: add rules engine + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + sha1_update_swap (&ctx, w, pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** 
+ * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + sha1_update_swap (&ctx, w, pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13300_a1.cl b/OpenCL/m13300_a1.cl new file mode 100644 index 000000000..c17774d88 --- /dev/null +++ b/OpenCL/m13300_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = 
get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + 
digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13300_a3.cl b/OpenCL/m13300_a3.cl new file mode 100644 index 000000000..2ba67e412 --- /dev/null +++ b/OpenCL/m13300_a3.cl @@ -0,0 +1,144 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, 
const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + sha1_update_vector (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + ctx.h[4] = 0; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 
digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + sha1_update_vector (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + ctx.h[4] = 0; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13500_a0.cl b/OpenCL/m13500_a0.cl new file mode 100644 index 000000000..6e8befd86 --- /dev/null +++ b/OpenCL/m13500_a0.cl @@ -0,0 +1,192 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + 
ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_utf16le_swap (&ctx, w, pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = 
swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_utf16le_swap (&ctx, w, pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13500_a1.cl b/OpenCL/m13500_a1.cl new file mode 100644 index 000000000..5cb45fda6 --- /dev/null +++ b/OpenCL/m13500_a1.cl @@ -0,0 +1,168 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S 
(esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + sha1_update_global_utf16le_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + 
if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base 
+ */ + + sha1_update_global_utf16le_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13500_a3.cl b/OpenCL/m13500_a3.cl new file mode 100644 index 000000000..18828963c --- /dev/null +++ b/OpenCL/m13500_a3.cl @@ -0,0 +1,206 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + 
/** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; 
idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_utf16beN (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m13500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + 
digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = 
w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_utf16beN (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13800_a0-optimized.cl b/OpenCL/m13800_a0-optimized.cl index 03e63e556..c44bc74af 100644 --- a/OpenCL/m13800_a0-optimized.cl +++ b/OpenCL/m13800_a0-optimized.cl @@ -146,6 +146,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -168,26 +188,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], 
selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; diff --git a/OpenCL/m13800_a0.cl b/OpenCL/m13800_a0.cl new file mode 100644 index 000000000..f6dd8b14c --- /dev/null +++ b/OpenCL/m13800_a0.cl @@ -0,0 +1,134 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha256.cl" + +__kernel void m13800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const 
u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha256_ctx_t ctx; + + sha256_init (&ctx); + + sha256_update_utf16le_swap (&ctx, w, pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const 
digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha256_ctx_t ctx; + + sha256_init (&ctx); + + sha256_update_utf16le_swap (&ctx, w, pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13800_a1-optimized.cl b/OpenCL/m13800_a1-optimized.cl index 2e557fae5..f6315b732 100644 --- a/OpenCL/m13800_a1-optimized.cl +++ b/OpenCL/m13800_a1-optimized.cl @@ -144,6 +144,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + 
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -166,26 +186,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - 
tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; diff --git a/OpenCL/m13800_a1.cl b/OpenCL/m13800_a1.cl new file mode 100644 index 000000000..6d618ad0b --- /dev/null +++ b/OpenCL/m13800_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha256.cl" + +__kernel void m13800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha256_ctx_t ctx0; + + sha256_init (&ctx0); + + sha256_update_global_utf16le_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos 
= 0; il_pos < il_cnt; il_pos++) + { + sha256_ctx_t ctx = ctx0; + + sha256_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha256_ctx_t ctx0; + + sha256_init (&ctx0); + + sha256_update_global_utf16le_swap 
(&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha256_ctx_t ctx = ctx0; + + sha256_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13800_a3-optimized.cl b/OpenCL/m13800_a3-optimized.cl index 85818a31e..618f7a130 100644 --- a/OpenCL/m13800_a3-optimized.cl +++ b/OpenCL/m13800_a3-optimized.cl @@ -143,6 +143,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -165,26 +185,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, 
carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; diff --git a/OpenCL/m13800_a3.cl b/OpenCL/m13800_a3.cl new file mode 100644 index 000000000..fcf68e292 --- /dev/null +++ b/OpenCL/m13800_a3.cl @@ -0,0 +1,166 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha256.cl" + +__kernel void m13800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global 
const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = 128; + + const u32 salt_lenv = 32; + + u32x s[32]; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = esalt_bufs[digests_offset].salt_buf[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha256_ctx_vector_t ctx; + + sha256_init_vector (&ctx); + + sha256_update_vector_utf16beN (&ctx, w, pw_len); + + sha256_update_vector (&ctx, s, salt_len); + + sha256_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m13800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = 128; + + const u32 salt_lenv = 32; + + u32x s[32]; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = esalt_bufs[digests_offset].salt_buf[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha256_ctx_vector_t ctx; + + sha256_init_vector (&ctx); + + sha256_update_vector_utf16beN (&ctx, w, pw_len); + + sha256_update_vector (&ctx, s, salt_len); + + sha256_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; 
+ const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m14000_a3-optimized.cl b/OpenCL/m14000_a3-optimized.cl index 4a1b81cd3..2a33d1f60 100644 --- a/OpenCL/m14000_a3-optimized.cl +++ b/OpenCL/m14000_a3-optimized.cl @@ -19,7 +19,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL volatile +#define KXX_DECL #endif #ifdef IS_GENERIC @@ -898,11 +898,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). 
@@ -921,469 +921,556 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, 
x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, 
x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ 
a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = 
x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 
& ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 = x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - 
vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; 
+ x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, 
x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + 
x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ 
x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | 
x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = 
a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = 
x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, 
x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + 
x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ 
xC729695A; + *out1 ^= x01; } #endif @@ -1452,60 +1539,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 KXX_DECL u32 k36, k37, k38, k39, k40, k41; KXX_DECL u32 k42, k43, k44, k45, k46, k47; - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30); - s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17); - s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02); - s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18); - s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62); - s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49); - s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, 
D32); - s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34); - s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50); - s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1599,8 +1632,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - - #endif } void transpose32c (u32 data[32]) diff --git a/OpenCL/m15700.cl b/OpenCL/m15700.cl index 57a33dc1b..8f5de92a9 100644 --- a/OpenCL/m15700.cl +++ b/OpenCL/m15700.cl @@ -138,6 +138,16 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 xd4 = x / 4; const u32 xm4 = x & 3; + __global uint4 *V; + + switch (xm4) + { + case 0: V = V0; break; + case 1: V = V1; break; + case 2: V = V2; break; + case 3: V = V3; break; + } + #ifdef _unroll #pragma unroll #endif @@ -156,13 +166,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ for (u32 y = 0; y < ySIZE; y++) { - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) V0[CO] = X[z]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) V1[CO] = X[z]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) V2[CO] = X[z]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) V3[CO] = X[z]; break; - } + for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } @@ -175,13 +179,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 km = k - (y * SCRYPT_TMTO); - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) T[z] 
= V0[CO]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) T[z] = V1[CO]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) T[z] = V2[CO]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) T[z] = V3[CO]; break; - } + for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO]; for (u32 i = 0; i < km; i++) salsa_r (T); @@ -398,8 +396,6 @@ __kernel void m15700_init (__global pw_t *pws, __global const kernel_rule_t *rul const uint4 tmp0 = (uint4) (digest[0], digest[1], digest[2], digest[3]); const uint4 tmp1 = (uint4) (digest[4], digest[5], digest[6], digest[7]); - barrier (CLK_GLOBAL_MEM_FENCE); - tmps[gid].P[k + 0] = tmp0; tmps[gid].P[k + 1] = tmp1; } @@ -464,8 +460,6 @@ __kernel void m15700_comp (__global pw_t *pws, __global const kernel_rule_t *rul for (u32 l = 0; l < SCRYPT_CNT4; l += 4) { - barrier (CLK_GLOBAL_MEM_FENCE); - uint4 tmp; tmp = tmps[gid].P[l + 0]; diff --git a/OpenCL/markov_be.cl b/OpenCL/markov_be.cl index b178259ed..b62775c43 100644 --- a/OpenCL/markov_be.cl +++ b/OpenCL/markov_be.cl @@ -9,7 +9,7 @@ #include "inc_types.cl" -inline void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) +void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) { __global const cs_t *cs = &root_css_buf[pw_r_len]; diff --git a/OpenCL/markov_le.cl b/OpenCL/markov_le.cl index a90fc489c..2d7babff4 100644 --- a/OpenCL/markov_le.cl +++ b/OpenCL/markov_le.cl @@ -9,7 +9,7 @@ #include "inc_types.cl" -inline void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) +void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global 
const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) { __global const cs_t *cs = &root_css_buf[pw_r_len]; diff --git a/docs/changes.txt b/docs/changes.txt index 6ca3df4a5..407876b44 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -20,6 +20,7 @@ ## - Fixed a parser error for mode -m 9820 = MS Office <= 2003 $3, SHA1 + RC4, collider #2 +- Fixed a problem with changed current working directory, for instance by using --restore together with --remove ## ## Improvements diff --git a/include/interface.h b/include/interface.h index 09c5199e6..ed91ea899 100644 --- a/include/interface.h +++ b/include/interface.h @@ -211,10 +211,10 @@ typedef struct bitcoin_wallet typedef struct sip { - u32 salt_buf[30]; + u32 salt_buf[32]; u32 salt_len; - u32 esalt_buf[38]; + u32 esalt_buf[48]; u32 esalt_len; } sip_t; @@ -1289,24 +1289,24 @@ typedef enum display_len DISPLAY_LEN_MIN_99999 = 1, DISPLAY_LEN_MAX_99999 = 55, - DISPLAY_LEN_MIN_11 = 32 + 1 + 16, - DISPLAY_LEN_MAX_11 = 32 + 1 + 32, - DISPLAY_LEN_MIN_12 = 32 + 1 + 1, + DISPLAY_LEN_MIN_11 = 32 + 1 + 0, + DISPLAY_LEN_MAX_11 = 32 + 1 + SALT_MAX, + DISPLAY_LEN_MIN_12 = 32 + 1 + 0, DISPLAY_LEN_MAX_12 = 32 + 1 + 32, - DISPLAY_LEN_MIN_21 = 32 + 1 + 1, - DISPLAY_LEN_MAX_21 = 32 + 1 + 15, + DISPLAY_LEN_MIN_21 = 32 + 1 + 2, + DISPLAY_LEN_MAX_21 = 32 + 1 + 2, DISPLAY_LEN_MIN_22 = 30 + 1 + 1, - DISPLAY_LEN_MAX_22 = 30 + 1 + 28, + DISPLAY_LEN_MAX_22 = 30 + 1 + 32, DISPLAY_LEN_MIN_23 = 32 + 1 + 0, DISPLAY_LEN_MAX_23 = 32 + 1 + SALT_MAX, DISPLAY_LEN_MIN_101 = 5 + 28, DISPLAY_LEN_MAX_101 = 5 + 28, - DISPLAY_LEN_MIN_111 = 6 + 28 + 0, - DISPLAY_LEN_MAX_111 = 6 + 28 + 40, + DISPLAY_LEN_MIN_111 = 6 + 28 + 1, + DISPLAY_LEN_MAX_111 = 6 + 28 + SALT_MAX, DISPLAY_LEN_MIN_112 = 40 + 1 + 20, DISPLAY_LEN_MAX_112 = 40 + 1 + 20, DISPLAY_LEN_MIN_121 = 40 + 1 + 1, - DISPLAY_LEN_MAX_121 = 40 + 1 + 32, + DISPLAY_LEN_MAX_121 = 40 + 1 + SALT_MAX, DISPLAY_LEN_MIN_122 = 8 + 40, 
DISPLAY_LEN_MAX_122 = 8 + 40, DISPLAY_LEN_MIN_124 = 4 + 1 + 0 + 1 + 40, @@ -1332,13 +1332,13 @@ typedef enum display_len DISPLAY_LEN_MIN_1731 = 128 + 6 + 0, DISPLAY_LEN_MAX_1731 = 128 + 6 + 16, DISPLAY_LEN_MIN_2611 = 32 + 1 + 0, - DISPLAY_LEN_MAX_2611 = 32 + 1 + 23, - DISPLAY_LEN_MIN_2612 = 6 + 0 + 1 + 32, - DISPLAY_LEN_MAX_2612 = 6 + 46 + 1 + 32, + DISPLAY_LEN_MAX_2611 = 32 + 1 + SALT_MAX, + DISPLAY_LEN_MIN_2612 = 6 + 0 + 1 + 32, + DISPLAY_LEN_MAX_2612 = 6 + SALT_MAX + 1 + 32, DISPLAY_LEN_MIN_2711 = 32 + 1 + 23, DISPLAY_LEN_MAX_2711 = 32 + 1 + 31, DISPLAY_LEN_MIN_2811 = 32 + 1 + 0, - DISPLAY_LEN_MAX_2811 = 32 + 1 + 31, + DISPLAY_LEN_MAX_2811 = 32 + 1 + SALT_MAX, DISPLAY_LEN_MIN_3711 = 3 + 0 + 1 + 32, DISPLAY_LEN_MAX_3711 = 3 + 31 + 1 + 32, DISPLAY_LEN_MIN_4521 = 40 + 1 + 32, diff --git a/include/types.h b/include/types.h index 07d1095ee..36a83a073 100644 --- a/include/types.h +++ b/include/types.h @@ -1008,7 +1008,6 @@ typedef struct hc_device_param char *device_name; char *device_vendor; - char *device_name_chksum; char *device_version; char *driver_version; char *device_opencl_version; @@ -1142,6 +1141,8 @@ typedef struct opencl_ctx bool need_xnvctrl; bool need_sysfs; + int comptime; + int force_jit_compilation; } opencl_ctx_t; diff --git a/src/interface.c b/src/interface.c index 0aaa2f488..6d195b58d 100644 --- a/src/interface.c +++ b/src/interface.c @@ -8004,7 +8004,7 @@ int nsec3_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUS } } - salt->salt_buf_pc[7] = domainbuf_len; + salt->salt_len_pc = domainbuf_len; // "real" salt @@ -17612,7 +17612,7 @@ int ascii_digest (hashcat_ctx_t *hashcat_ctx, char *out_buf, const size_t out_le // domain - const u32 salt_pc_len = salt.salt_buf_pc[7]; // what a hack + const u32 salt_pc_len = salt.salt_len_pc; char domain_buf_c[33] = { 0 }; @@ -24642,9 +24642,9 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) { switch (user_options_extra->attack_kern) { - case ATTACK_KERN_STRAIGHT: if (hashconfig->pw_max > 
PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + case ATTACK_KERN_STRAIGHT: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; - case ATTACK_KERN_COMBI: if (hashconfig->pw_max > PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + case ATTACK_KERN_COMBI: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; } } @@ -24654,9 +24654,9 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) { switch (user_options_extra->attack_kern) { - case ATTACK_KERN_STRAIGHT: if (hashconfig->pw_max > PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + case ATTACK_KERN_STRAIGHT: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; - case ATTACK_KERN_COMBI: if (hashconfig->pw_max > PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + case ATTACK_KERN_COMBI: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; } } @@ -24668,27 +24668,29 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) switch (hashconfig->hash_mode) { - case 500: hashconfig->pw_max = 15; // -L available + case 500: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 1600: hashconfig->pw_max = 15; // -L available + case 1600: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 1800: hashconfig->pw_max = 16; // -L available + case 1800: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // pure kernel available break; - case 5800: hashconfig->pw_max = 16; // -L available + case 5800: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // pure kernel available break; - case 6300: hashconfig->pw_max = 15; // -L available + case 6300: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 7000: hashconfig->pw_max = 19; // todo + case 6900: hashconfig->pw_max = MIN (hashconfig->pw_max, 32); // todo break; - case 7400: hashconfig->pw_max = 15; // -L available + case 7000: hashconfig->pw_max = MIN (hashconfig->pw_max, 19); // pure kernel available break; - case 10700: hashconfig->pw_max 
= 16; // -L available + case 7400: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 12500: hashconfig->pw_max = 20; // todo + case 10700: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // pure kernel available break; - case 14400: hashconfig->pw_max = 24; // todo + case 12500: hashconfig->pw_max = MIN (hashconfig->pw_max, 20); // todo break; - case 15500: hashconfig->pw_max = 16; // todo + case 14400: hashconfig->pw_max = MIN (hashconfig->pw_max, 24); // todo + break; + case 15500: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // todo break; } } @@ -24702,9 +24704,9 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) } // pw_max : all modes listed in the following switch cases are - // the maximum possible password length by the related system - // plus the opencl kernels support to crack them without -L set by the user - // however, some modes have a self-set and some have + // the maximum possible password length of the related system + // plus the opencl kernels which eventually allows cracking of passwords of up length PW_MAX for free (no speed drop). 
+ // some modes have a self-set and some have // underlaying algorithms specific hard maximum password length // these limits override all previous restrictions, always @@ -24714,7 +24716,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 1500: hashconfig->pw_max = 8; break; // Underlaying DES max case 2100: hashconfig->pw_max = PW_MAX; break; case 2400: hashconfig->pw_max = 16; break; // Cisco-PIX MD5 sets w[4] = 0x80 - case 2410: hashconfig->pw_max = 12; break; // Cisco-ASA MD5 sets w[4] = 0x80 and has a 4 byte fixed salt + case 2410: hashconfig->pw_max = 12; break; // Cisco-ASA MD5 sets w[4] = 0x80 plus has a 4 byte fixed salt case 2500: hashconfig->pw_max = 63; break; // WPA/WPA2 limits itself to 63 by RFC case 2501: hashconfig->pw_max = 64; break; // WPA/WPA2 PMK fixed length case 3000: hashconfig->pw_max = 7; break; // LM max @@ -24743,19 +24745,26 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 7700: hashconfig->pw_max = 8; break; // https://www.daniel-berlin.de/security/sap-sec/password-hash-algorithms/ case 7800: hashconfig->pw_max = 40; break; // https://www.daniel-berlin.de/security/sap-sec/password-hash-algorithms/ case 7900: hashconfig->pw_max = PW_MAX; break; + case 8000: hashconfig->pw_max = 30; break; // http://infocenter.sybase.com/help/index.jsp?topic=/com.sybase.infocenter.dc31654.1570/html/sag1/CIHIBDBA.htm case 8200: hashconfig->pw_max = PW_MAX; break; case 8500: hashconfig->pw_max = 8; break; // Underlaying DES max - case 8600: hashconfig->pw_max = 16; break; // Lotus Notes/Domino 5 limits itself to 8 + case 8600: hashconfig->pw_max = 16; break; // Lotus Notes/Domino 5 limits itself to 16 + case 8700: hashconfig->pw_max = 64; break; // https://www.ibm.com/support/knowledgecenter/en/SSKTWP_8.5.3/com.ibm.notes85.client.doc/fram_limits_of_notes_r.html case 8800: hashconfig->pw_max = PW_MAX; break; case 8900: hashconfig->pw_max = PW_MAX; break; - case 9100: hashconfig->pw_max = 64; break; // Lotus Notes/Domino 8 limits itself to 
64 + case 9100: hashconfig->pw_max = 64; break; // https://www.ibm.com/support/knowledgecenter/en/SSKTWP_8.5.3/com.ibm.notes85.client.doc/fram_limits_of_notes_r.html case 9200: hashconfig->pw_max = PW_MAX; break; case 9300: hashconfig->pw_max = PW_MAX; break; case 9400: hashconfig->pw_max = PW_MAX; break; case 9500: hashconfig->pw_max = PW_MAX; break; case 9600: hashconfig->pw_max = PW_MAX; break; + case 9700: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx case 9710: hashconfig->pw_max = 5; break; // Underlaying RC4-40 max + case 9720: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx + case 9800: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx case 9810: hashconfig->pw_max = 5; break; // Underlaying RC4-40 max + case 9820: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx + case 9900: hashconfig->pw_max = 100; break; // RAdmin2 sets w[25] = 0x80 case 10000: hashconfig->pw_max = PW_MAX; break; case 10300: hashconfig->pw_max = 40; break; // https://www.daniel-berlin.de/security/sap-sec/password-hash-algorithms/ case 10400: hashconfig->pw_max = 32; break; // https://www.pdflib.com/knowledge-base/pdf-password-security/encryption/ @@ -25102,6 +25111,8 @@ void hashconfig_benchmark_defaults (hashcat_ctx_t *hashcat_ctx, salt_t *salt, vo break; case 10300: salt->salt_len = 12; break; + case 11000: salt->salt_len = 56; + break; case 11500: salt->salt_len = 4; break; case 11600: salt->salt_len = 4; diff --git a/src/opencl.c b/src/opencl.c index 3fa5d0280..ee9ab76ef 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -402,15 +402,15 @@ void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_typ } } -void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum, char 
*cached_file) +void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file) { if ((opti_type & OPTI_TYPE_BRUTE_FORCE) && (opts_type & OPTS_TYPE_PT_GENERATE_BE)) { - snprintf (cached_file, 255, "%s/kernels/markov_be.%s.kernel", profile_dir, device_name_chksum); + snprintf (cached_file, 255, "%s/kernels/markov_be.%s.kernel", profile_dir, device_name_chksum_amp_mp); } else { - snprintf (cached_file, 255, "%s/kernels/markov_le.%s.kernel", profile_dir, device_name_chksum); + snprintf (cached_file, 255, "%s/kernels/markov_le.%s.kernel", profile_dir, device_name_chksum_amp_mp); } } @@ -419,9 +419,9 @@ void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_di snprintf (source_file, 255, "%s/OpenCL/amp_a%u.cl", shared_dir, attack_kern); } -void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum, char *cached_file) +void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file) { - snprintf (cached_file, 255, "%s/kernels/amp_a%u.%s.kernel", profile_dir, attack_kern, device_name_chksum); + snprintf (cached_file, 255, "%s/kernels/amp_a%u.%s.kernel", profile_dir, attack_kern, device_name_chksum_amp_mp); } int ocl_init (hashcat_ctx_t *hashcat_ctx) @@ -2190,6 +2190,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co uppercase (ptr, line_len); } + /* if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT) { if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) @@ -2202,6 +2203,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co ptr[line_len] = 0x01; } } + */ device_param->combs_buf[i].pw_len = line_len; @@ -3121,27 +3123,6 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->driver_version = driver_version; - // 
device_name_chksum - - char *device_name_chksum = (char *) hcmalloc (HCBUFSIZ_TINY); - - #if defined (__x86_64__) - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 64, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, comptime, user_options->opencl_vector_width, user_options->hash_mode); - #else - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 32, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, comptime, user_options->opencl_vector_width, user_options->hash_mode); - #endif - - u32 device_name_digest[4] = { 0 }; - - for (size_t i = 0; i < dnclen; i += 64) - { - md5_64 ((u32 *) (device_name_chksum + i), device_name_digest); - } - - snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%08x", device_name_digest[0]); - - device_param->device_name_chksum = device_name_chksum; - // vendor specific if (device_param->device_type & CL_DEVICE_TYPE_GPU) @@ -3436,6 +3417,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) opencl_ctx->need_xnvctrl = need_xnvctrl; opencl_ctx->need_sysfs = need_sysfs; + opencl_ctx->comptime = comptime; + return 0; } @@ -3459,7 +3442,6 @@ void opencl_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->skipped == true) continue; hcfree (device_param->device_name); - hcfree (device_param->device_name_chksum); hcfree (device_param->device_version); hcfree (device_param->driver_version); hcfree (device_param->device_opencl_version); @@ -3810,8 +3792,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) * device properties */ - const char *device_name_chksum = device_param->device_name_chksum; - const u32 device_processors = device_param->device_processors; + const u32 device_processors = 
device_param->device_processors; /** * create context for each device @@ -4234,9 +4215,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) char build_opts_new[1024] = { 0 }; #if defined (DEBUG) - snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -cl-std=CL1.2", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); + snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); #else - snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -cl-std=CL1.2 -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); + snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u 
-D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); #endif if (device_param->device_type & CL_DEVICE_TYPE_CPU) @@ -4253,6 +4234,39 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_opts '%s'", device_id + 1, build_opts); #endif + /** + * device_name_chksum + */ + + char *device_name_chksum = (char *) hcmalloc (HCBUFSIZ_TINY); + char *device_name_chksum_amp_mp = (char *) hcmalloc (HCBUFSIZ_TINY); + + #if defined (__x86_64__) + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 64, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, user_options->hash_mode); + const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY - 1, "%d-%u-%s-%s-%s-%d", 64, device_param->platform_vendor_id, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime); + #else + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 32, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, user_options->hash_mode); + const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY - 1, "%d-%u-%s-%s-%s-%d", 32, device_param->platform_vendor_id, device_param->device_name, device_param->device_version, 
device_param->driver_version, opencl_ctx->comptime); + #endif + + u32 device_name_digest[4] = { 0 }; + + for (size_t i = 0; i < dnclen; i += 64) + { + md5_64 ((u32 *) (device_name_chksum + i), device_name_digest); + } + + snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%08x", device_name_digest[0]); + + u32 device_name_digest_amp_mp[4] = { 0 }; + + for (size_t i = 0; i < dnclen_amp_mp; i += 64) + { + md5_64 ((u32 *) (device_name_chksum_amp_mp + i), device_name_digest_amp_mp); + } + + snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY - 1, "%08x", device_name_digest_amp_mp[0]); + /** * main kernel */ @@ -4482,7 +4496,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) char cached_file[256] = { 0 }; - generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum, cached_file); + generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); bool cached = true; @@ -4623,7 +4637,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) char cached_file[256] = { 0 }; - generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum, cached_file); + generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); bool cached = true; @@ -4733,6 +4747,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) hcfree (kernel_sources[0]); } + hcfree (device_name_chksum); + hcfree (device_name_chksum_amp_mp); + // return back to the folder we came from initially (workaround) if (chdir (folder_config->cwd) == -1) diff --git a/src/restore.c b/src/restore.c index 9d576e99e..82a2e14b2 100644 --- a/src/restore.c +++ b/src/restore.c @@ -9,6 +9,8 @@ #include "event.h" #include "user_options.h" #include "shared.h" +#include "pidfile.h" +#include "folder.h" #include "restore.h" #if defined (_WIN) 
@@ -45,7 +47,8 @@ static int init_restore (hashcat_ctx_t *hashcat_ctx) static int read_restore (hashcat_ctx_t *hashcat_ctx) { - restore_ctx_t *restore_ctx = hashcat_ctx->restore_ctx; + restore_ctx_t *restore_ctx = hashcat_ctx->restore_ctx; + folder_config_t *folder_config = hashcat_ctx->folder_config; if (restore_ctx->enabled == false) return 0; @@ -131,20 +134,56 @@ static int read_restore (hashcat_ctx_t *hashcat_ctx) return -1; } - event_log_warning (hashcat_ctx, "Changing current working directory to '%s'", rd->cwd); - event_log_warning (hashcat_ctx, NULL); - - if (chdir (rd->cwd)) + if (strncmp (rd->cwd, folder_config->cwd, sizeof (rd->cwd)) != 0) // check if we need to change the current working directory { - event_log_error (hashcat_ctx, "Directory '%s' needed to restore the session was not found.", rd->cwd); - - event_log_warning (hashcat_ctx, "Either create the directory, or update the directory within the .restore file."); - event_log_warning (hashcat_ctx, "Restore files can be analyzed and modified with analyze_hc_restore.pl:"); - event_log_warning (hashcat_ctx, " https://github.com/philsmd/analyze_hc_restore"); - event_log_warning (hashcat_ctx, "Directory must contain all files and folders from the original command line."); + event_log_warning (hashcat_ctx, "Changing current working directory to '%s'", rd->cwd); event_log_warning (hashcat_ctx, NULL); - return -1; + if (chdir (rd->cwd)) + { + event_log_error (hashcat_ctx, "Directory '%s' needed to restore the session was not found.", rd->cwd); + + event_log_warning (hashcat_ctx, "Either create the directory, or update the directory within the .restore file."); + event_log_warning (hashcat_ctx, "Restore files can be analyzed and modified with analyze_hc_restore.pl:"); + event_log_warning (hashcat_ctx, " https://github.com/philsmd/analyze_hc_restore"); + event_log_warning (hashcat_ctx, "Directory must contain all files and folders from the original command line."); + event_log_warning (hashcat_ctx, NULL); + 
+ return -1; + } + + // if we are here, we also need to update the folder_config and .pid file: + + /** + * updated folders + */ + + const char *install_folder = NULL; + const char *shared_folder = NULL; + + #if defined (INSTALL_FOLDER) + install_folder = INSTALL_FOLDER; + #endif + + #if defined (SHARED_FOLDER) + shared_folder = SHARED_FOLDER; + #endif + + folder_config_destroy (hashcat_ctx); + + const int rc_folder_config_init = folder_config_init (hashcat_ctx, install_folder, shared_folder); + + if (rc_folder_config_init == -1) return -1; + + /** + * updated pidfile + */ + + pidfile_ctx_destroy (hashcat_ctx); + + const int rc_pidfile_init = pidfile_ctx_init (hashcat_ctx); + + if (rc_pidfile_init == -1) return -1; } return 0; diff --git a/src/shared.c b/src/shared.c index 73d17646d..0e209edd5 100644 --- a/src/shared.c +++ b/src/shared.c @@ -334,32 +334,8 @@ void setup_environment_variables () putenv ((char *) "DISPLAY=:0"); } - if (getenv ("GPU_FORCE_64BIT_PTR") == NULL) - putenv ((char *) "GPU_FORCE_64BIT_PTR=1"); - - if (getenv ("GPU_MAX_ALLOC_PERCENT") == NULL) - putenv ((char *) "GPU_MAX_ALLOC_PERCENT=100"); - - if (getenv ("GPU_SINGLE_ALLOC_PERCENT") == NULL) - putenv ((char *) "GPU_SINGLE_ALLOC_PERCENT=100"); - - if (getenv ("GPU_MAX_HEAP_SIZE") == NULL) - putenv ((char *) "GPU_MAX_HEAP_SIZE=100"); - - if (getenv ("CPU_FORCE_64BIT_PTR") == NULL) - putenv ((char *) "CPU_FORCE_64BIT_PTR=1"); - - if (getenv ("CPU_MAX_ALLOC_PERCENT") == NULL) - putenv ((char *) "CPU_MAX_ALLOC_PERCENT=100"); - - if (getenv ("CPU_SINGLE_ALLOC_PERCENT") == NULL) - putenv ((char *) "CPU_SINGLE_ALLOC_PERCENT=100"); - - if (getenv ("CPU_MAX_HEAP_SIZE") == NULL) - putenv ((char *) "CPU_MAX_HEAP_SIZE=100"); - - if (getenv ("GPU_USE_SYNC_OBJECTS") == NULL) - putenv ((char *) "GPU_USE_SYNC_OBJECTS=1"); + if (getenv ("OCL_CODE_CACHE_ENABLE") == NULL) + putenv ((char *) "OCL_CODE_CACHE_ENABLE=0"); if (getenv ("CUDA_CACHE_DISABLE") == NULL) putenv ((char *) "CUDA_CACHE_DISABLE=1"); @@ 
-397,27 +373,13 @@ u32 get_random_num (const u32 min, const u32 max) if (low == 0) return (0); - #if defined (__linux__) + #if defined (_WIN) - u32 data; - - FILE *fp = fopen ("/dev/urandom", "rb"); - - if (fp == NULL) return (0); - - const int nread = fread (&data, sizeof (u32), 1, fp); - - fclose (fp); - - if (nread != 1) return 0; - - u64 r = data % low; r += min; - - return (u32) r; + return (((u32) rand () % (max - min)) + min); #else - return (((u32) rand () % (max - min)) + min); + return (((u32) random () % (max - min)) + min); #endif } diff --git a/tools/test.pl b/tools/test.pl index 5d65ee760..14d44fe26 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -3545,7 +3545,7 @@ sub passthrough $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, $salt_len)); } - elsif ($mode == 9400 || $mode == 9500 || $mode == 9600 || $mode == 9700 || $mode == 9800) + elsif ($mode == 9400 || $mode == 9500 || $mode == 9600) { next if length ($word_buf) > 19; @@ -3553,6 +3553,14 @@ sub passthrough $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, $salt_len)); } + elsif ($mode == 9700 || $mode == 9800) + { + next if length ($word_buf) > 15; + + my $salt_len = 32; + + $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, $salt_len)); + } elsif ($mode == 10100) { $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, 32)); @@ -4326,7 +4334,7 @@ sub single } } } - elsif ($mode == 9400 || $mode == 9500 || $mode == 9600 || $mode == 9700 || $mode == 9800) + elsif ($mode == 9400 || $mode == 9500 || $mode == 9600) { my $salt_len = 32; @@ -4342,6 +4350,22 @@ sub single } } } + elsif ($mode == 9700 || $mode == 9800) + { + my $salt_len = 32; + + for (my $i = 1; $i < 16; $i++) + { + if ($len != 0) + { + rnd ($mode, $len, $salt_len); + } + else + { + rnd ($mode, $i, $salt_len); + } + } + } elsif ($mode == 10100) { for (my $i = 1; $i < 32; $i++) diff --git a/tools/test.sh b/tools/test.sh index 92cdc11fa..df97d144b 100755 --- a/tools/test.sh +++ 
b/tools/test.sh @@ -24,7 +24,7 @@ NEVER_CRACK="11600 14900" SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12001 12100 12200 12300 12400 12500 12700 12800 12900 13000 13200 13400 13600 14600 14700 14800 15100 15200 15300 15600 15700 15800" -OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0 -d 1" +OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0" OUTD="test_$(date +%s)" @@ -237,30 +237,36 @@ function init() rm -rf ${OUTD}/${hash_type}_dict1 ${OUTD}/${hash_type}_dict2 touch ${OUTD}/${hash_type}_dict1 ${OUTD}/${hash_type}_dict2 - # foreach password entry split password in 2 (skip first entry, is len 1) - i=1 - # minimum password length - min_len=0 + min=1 # minimum line number from start of the file + min_offset=0 # minimum offset starting from ${min} lines if [ "${hash_type}" -eq 2500 ]; then - min_len=7 # means length 8, since we start with 0 + min_offset=7 # means length 8, since we start with 0 elif [ "${hash_type}" -eq 14000 ]; then - min_len=7 + min=0 + min_offset=4 elif [ "${hash_type}" -eq 14100 ]; then - min_len=23 + min=0 + min_offset=3 elif [ "${hash_type}" -eq 14900 ]; then - min_len=9 + min=0 + min_offset=5 elif [ "${hash_type}" -eq 15400 ]; then - min_len=31 + min=0 + min_offset=3 elif [ "${hash_type}" -eq 15800 ]; then min_len=7 fi + # foreach password entry split password in 2 (skip first entry, is len 1) + + i=1 + while read -u 9 pass; do - if [ ${i} -gt 1 ]; then + if [ ${i} -gt ${min} ]; then # split password, 'i' is the len p0=$((i / 2)) @@ -272,8 +278,8 @@ function init() if [ "${pass_len}" -gt 1 ] then - p1=$((p1 + ${min_len})) - p0=$((p0 + ${min_len})) + p1=$((p1 + ${min_offset})) + p0=$((p0 + ${min_offset})) 
if [ "${p1}" -gt ${pass_len} ]; then @@ -597,11 +603,23 @@ function attack_1() e_nm=0 cnt=0 + min=1 + + if [ "${hash_type}" -eq 14000 ]; then + min=0 + elif [ "${hash_type}" -eq 14100 ]; then + min=0 + elif [ "${hash_type}" -eq 14900 ]; then + min=0 + elif [ "${hash_type}" -eq 15400 ]; then + min=0 + fi + echo "> Testing hash type $hash_type with attack mode 1, markov ${MARKOV}, single hash, Device-Type ${TYPE}, vector-width ${VECTOR}." &>> ${OUTD}/logfull.txt i=1 while read -u 9 hash; do - if [ $i -gt 1 ]; then + if [ $i -gt ${min} ]; then if [ "${file_only}" -eq 1 ]; then @@ -623,7 +641,11 @@ function attack_1() if [ "${ret}" -eq 0 ]; then - line_nr=$((i - 1)) + line_nr=1 + + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi line_dict1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) line_dict2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) @@ -671,6 +693,18 @@ function attack_1() # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0 @@ -694,14 +728,6 @@ function attack_1() offset=7 elif [ ${hash_type} -eq 8500 ]; then offset=7 - elif [ ${hash_type} -eq 14000 ]; then - offset=7 - elif [ ${hash_type} -eq 14100 ]; then - offset=23 - elif [ ${hash_type} -eq 14900 ]; then - offset=9 - elif [ ${hash_type} -eq 15400 ]; then - offset=31 elif [ ${hash_type} -eq 15800 ]; then offset=7 fi @@ -743,7 +769,11 @@ function attack_1() while read -u 9 hash; do - line_nr=$((offset - i)) + line_nr=1 + + if [ "${offset}" -gt ${i} ]; then + line_nr=$((${offset} - ${i})) + fi line_dict1=$(tail -n ${line_nr} ${OUTD}/${hash_type}_dict1 | head -1) line_dict2=$(tail -n ${line_nr} ${OUTD}/${hash_type}_dict2 | head -1) @@ -818,22 +848,52 @@ function attack_3() mask_offset=7 max=7 elif 
[ "${hash_type}" -eq 14000 ]; then - mask_offset=7 - max=7 + mask_offset=4 + max=1 elif [ "${hash_type}" -eq 14100 ]; then - mask_offset=23 - max=23 + mask_offset=3 + max=1 elif [ "${hash_type}" -eq 14900 ]; then - mask_offset=9 - max=9 + mask_offset=5 + max=1 elif [ "${hash_type}" -eq 15400 ]; then - mask_offset=31 - max=31 + mask_offset=3 + max=1 elif [ "${hash_type}" -eq 15800 ]; then mask_offset=7 max=7 fi + # special case: we need to split the first line + + if [ "${mask_offset}" -ne 0 ]; then + + pass=$(sed -n 1p ${OUTD}/${hash_type}_passwords.txt) + + pass_part_2=$(echo -n ${pass} | cut -b $((${mask_offset} + 1))-) + + mask_custom="" + + if [ "${hash_type}" -eq 14000 ]; then + + mask_custom="${pass}" + + elif [ "${hash_type}" -eq 14100 ]; then + + mask_custom="${pass}" + + else + + for i in $(seq 1 ${mask_offset}); do + mask_custom="${mask_custom}?d" + done + + mask_custom="${mask_custom}${pass_part_2}" + + fi + + fi + i=1 while read -u 9 hash; do @@ -842,7 +902,7 @@ function attack_3() if ! 
contains ${hash_type} ${TIMEOUT_ALGOS}; then - break; + break fi @@ -857,12 +917,13 @@ function attack_3() fi mask=${mask_3[$((i + ${mask_offset}))]} + dict="${OUTD}/${hash_type}_passwords.txt" # modify "default" mask if needed (and set custom charset to reduce keyspace) if [ "${hash_type}" -eq 2500 ] || [ "${hash_type}" -eq 15800 ]; then - pass=$(sed -n ${i}p ${OUTD}/${hash_type}_passwords.txt) + pass=$(sed -n ${i}p ${dict}) mask=${pass} @@ -882,6 +943,10 @@ function attack_3() fi + if [ "${mask_offset}" -ne 0 ]; then + mask=${mask_custom} + fi + CMD="./${BIN} ${OPTS} -a 3 -m ${hash_type} '${hash}' ${mask}" echo -n "[ len $i ] " &>> ${OUTD}/logfull.txt @@ -894,7 +959,7 @@ function attack_3() if [ "${ret}" -eq 0 ]; then - line_dict=$(sed -n ${i}p ${OUTD}/${hash_type}_passwords.txt) + line_dict=$(sed -n ${i}p ${dict}) if [ ${pass_only} -eq 1 ]; then search=":${line_dict}" @@ -939,6 +1004,18 @@ function attack_3() # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0 @@ -957,18 +1034,6 @@ function attack_3() if [ "${hash_type}" -eq 2500 ]; then increment_min=8 increment_max=9 - elif [ "${hash_type}" -eq 14000 ]; then - increment_min=8 - increment_max=8 - elif [ "${hash_type}" -eq 14100 ]; then - increment_min=24 - increment_max=24 - elif [ "${hash_type}" -eq 14900 ]; then - increment_min=10 - increment_max=10 - elif [ "${hash_type}" -eq 15400 ]; then - increment_min=32 - increment_max=32 elif [ "${hash_type}" -eq 15800 ]; then increment_min=8 increment_max=9 @@ -1170,37 +1235,84 @@ function attack_6() echo "> Testing hash type $hash_type with attack mode 6, markov ${MARKOV}, single hash, Device-Type ${TYPE}, vector-width ${VECTOR}." 
&>> ${OUTD}/logfull.txt - i=1 - + min=1 max=8 + mask_offset=0 if [ "${hash_type}" -eq 2500 ]; then max=6 elif [ "${hash_type}" -eq 14000 ]; then - max=6 + min=0 + max=1 + mask_offset=4 elif [ "${hash_type}" -eq 14100 ]; then - max=6 + min=0 + max=1 + mask_offset=21 elif [ "${hash_type}" -eq 14900 ]; then - max=6 + min=0 + max=1 + mask_offset=5 elif [ "${hash_type}" -eq 15400 ]; then - max=6 + min=0 + max=1 + mask_offset=29 elif [ "${hash_type}" -eq 15800 ]; then max=6 fi + # special case: we need to split the first line + + if [ "${min}" -eq 0 ]; then + + pass_part_1=$(sed -n 1p ${OUTD}/${hash_type}_dict1) + pass_part_2=$(sed -n 1p ${OUTD}/${hash_type}_dict2) + + pass="${pass_part_1}${pass_part_2}" + + echo -n ${pass} | cut -b -$((${mask_offset} + 0)) > ${OUTD}/${hash_type}_dict1_custom + echo -n ${pass} | cut -b $((${mask_offset} + 1))- > ${OUTD}/${hash_type}_dict2_custom + + mask_custom="" + + for i in $(seq 1 $((${#pass} - ${mask_offset}))); do + + if [ "${hash_type}" -eq 14000 ]; then + + char=$(echo -n ${pass} | cut -b $((${i} + ${mask_offset}))) + mask_custom="${mask_custom}${char}" + + elif [ "${hash_type}" -eq 14100 ]; then + + char=$(echo -n ${pass} | cut -b $((${i} + ${mask_offset}))) + mask_custom="${mask_custom}${char}" + + else + + mask_custom="${mask_custom}?d" + + fi + + done + + fi + + + i=1 + while read -u 9 hash; do if [ "${i}" -gt 6 ]; then if ! 
contains ${hash_type} ${TIMEOUT_ALGOS}; then - break; + break fi fi - if [ $i -gt 1 ]; then + if [ ${i} -gt ${min} ]; then if [ "${file_only}" -eq 1 ]; then @@ -1210,11 +1322,23 @@ function attack_6() fi - CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} '${hash}' ${OUTD}/${hash_type}_dict1 ${mask_6[$i]}" + mask=${mask_6[${i}]} + + dict1=${OUTD}/${hash_type}_dict1 + dict2=${OUTD}/${hash_type}_dict2 + + if [ "${min}" -eq 0 ]; then + mask=${mask_custom} + + dict1=${OUTD}/${hash_type}_dict1_custom + dict2=${OUTD}/${hash_type}_dict2_custom + fi + + CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} '${hash}' ${dict1} ${mask}" echo -n "[ len $i ] " &>> ${OUTD}/logfull.txt - output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} "${hash}" ${OUTD}/${hash_type}_dict1 ${mask_6[$i]} 2>&1) + output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} "${hash}" ${dict1} ${mask} 2>&1) ret=${?} @@ -1222,10 +1346,14 @@ function attack_6() if [ "${ret}" -eq 0 ]; then - line_nr=$((i - 1)) + line_nr=1 - line_dict1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) - line_dict2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi + + line_dict1=$(sed -n ${line_nr}p ${dict1}) + line_dict2=$(sed -n ${line_nr}p ${dict2}) if [ ${pass_only} -eq 1 ]; then search=":${line_dict1}${line_dict2}" @@ -1267,11 +1395,26 @@ function attack_6() echo "[ ${OUTD} ] [ Type ${hash_type}, Attack 6, Mode single, Device-Type ${TYPE}, Vector-Width ${VECTOR} ] > $msg : ${e_nf}/${cnt} not found, ${e_nm}/${cnt} not matched, ${e_to}/${cnt} timeout" + rm -f ${OUTD}/${hash_type}_dict1_custom + rm -f ${OUTD}/${hash_type}_dict2_custom + fi # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0 @@ 
-1287,14 +1430,6 @@ function attack_6() max=8 elif [ ${hash_type} -eq 8500 ]; then max=8 - elif [ ${hash_type} -eq 14000 ]; then - max=5 - elif [ ${hash_type} -eq 14100 ]; then - max=5 - elif [ ${hash_type} -eq 14900 ]; then - max=5 - elif [ ${hash_type} -eq 15400 ]; then - max=5 elif [ ${hash_type} -eq 15800 ]; then max=5 fi @@ -1332,11 +1467,13 @@ function attack_6() fi - CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask_6[$i]}" + mask=${mask_6[$i]} + + CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask}" echo "> Testing hash type $hash_type with attack mode 6, markov ${MARKOV}, multi hash with word len ${i}." &>> ${OUTD}/logfull.txt - output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask_6[$i]} 2>&1) + output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask} 2>&1) ret=${?} @@ -1414,27 +1551,74 @@ function attack_7() echo "> Testing hash type $hash_type with attack mode 7, markov ${MARKOV}, single hash, Device-Type ${TYPE}, vector-width ${VECTOR}." 
&>> ${OUTD}/logfull.txt + min=1 max=8 + mask_offset=0 + if [ "${hash_type}" -eq 2500 ]; then max=5 elif [ "${hash_type}" -eq 14000 ]; then - max=5 + mask_offset=4 + min=0 + max=1 elif [ "${hash_type}" -eq 14100 ]; then - max=5 + mask_offset=3 + min=0 + max=1 elif [ "${hash_type}" -eq 14900 ]; then - max=5 + mask_offset=5 + min=0 + max=1 elif [ "${hash_type}" -eq 15400 ]; then - max=5 + mask_offset=3 + min=0 + max=1 elif [ "${hash_type}" -eq 15800 ]; then max=5 fi + # special case: we need to split the first line + + if [ "${min}" -eq 0 ]; then + + pass_part_1=$(sed -n 1p ${OUTD}/${hash_type}_dict1) + pass_part_2=$(sed -n 1p ${OUTD}/${hash_type}_dict2) + + pass="${pass_part_1}${pass_part_2}" + + echo -n ${pass} | cut -b -$((${mask_offset} + 0)) > ${OUTD}/${hash_type}_dict1_custom + echo -n ${pass} | cut -b $((${mask_offset} + 1))- > ${OUTD}/${hash_type}_dict2_custom + + mask_custom="" + + for i in $(seq 1 ${mask_offset}); do + + if [ "${hash_type}" -eq 14000 ]; then + + char=$(echo -n ${pass} | cut -b ${i}) + mask_custom="${mask_custom}${char}" + + elif [ "${hash_type}" -eq 14100 ]; then + + char=$(echo -n ${pass} | cut -b ${i}) + mask_custom="${mask_custom}${char}" + + else + + mask_custom="${mask_custom}?d" + + fi + + done + + fi + i=1 while read -u 9 hash; do - if [ $i -gt 1 ]; then + if [ ${i} -gt ${min} ]; then if [ "${file_only}" -eq 1 ]; then @@ -1450,7 +1634,11 @@ function attack_7() if [ "${hash_type}" -eq 2500 ] || [ "${hash_type}" -eq 15800 ]; then - line_nr=$((i - 1)) + line_nr=1 + + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi pass_part_1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) pass_part_2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) @@ -1470,11 +1658,21 @@ function attack_7() fi - CMD="./${BIN} ${OPTS} -a 7 -m ${hash_type} '${hash}' ${mask} ${OUTD}/${hash_type}_dict2" + dict1=${OUTD}/${hash_type}_dict1 + dict2=${OUTD}/${hash_type}_dict2 + + if [ "${min}" -eq 0 ]; then + mask=${mask_custom} + + 
dict1=${OUTD}/${hash_type}_dict1_custom + dict2=${OUTD}/${hash_type}_dict2_custom + fi + + CMD="./${BIN} ${OPTS} -a 7 -m ${hash_type} '${hash}' ${mask} ${dict2}" echo -n "[ len $i ] " &>> ${OUTD}/logfull.txt - output=$(./${BIN} ${OPTS} -a 7 -m ${hash_type} "${hash}" ${mask} ${OUTD}/${hash_type}_dict2 2>&1) + output=$(./${BIN} ${OPTS} -a 7 -m ${hash_type} "${hash}" ${mask} ${dict2} 2>&1) ret=${?} @@ -1482,10 +1680,14 @@ function attack_7() if [ "${ret}" -eq 0 ]; then - line_nr=$((i - 1)) + line_nr=1 - line_dict1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) - line_dict2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi + + line_dict1=$(sed -n ${line_nr}p ${dict1}) + line_dict2=$(sed -n ${line_nr}p ${dict2}) if [ ${pass_only} -eq 1 ]; then search=":${line_dict1}${line_dict2}" @@ -1527,11 +1729,26 @@ function attack_7() echo "[ ${OUTD} ] [ Type ${hash_type}, Attack 7, Mode single, Device-Type ${TYPE}, Vector-Width ${VECTOR} ] > $msg : ${e_nf}/${cnt} not found, ${e_nm}/${cnt} not matched, ${e_to}/${cnt} timeout" + rm -f ${OUTD}/${hash_type}_dict1_custom + rm -f ${OUTD}/${hash_type}_dict2_custom + fi # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0